Merge branch 'master' of https://github.com/tqchen/xgboost into tqchen-master

Conflicts:
	README.md
giuliohome · 2014-09-05 13:24:45 +02:00 · commit 909a61edac
105 changed files with 3356 additions and 10686 deletions

.gitignore

@@ -6,12 +6,15 @@
 # Compiled Dynamic libraries
 *.so
 *.dylib
+*.page
 # Compiled Static libraries
 *.lai
 *.la
 *.a
 *~
+*.Rcheck
+*.rds
+*.tar.gz
 *txt*
 *conf
 *buffer
@@ -40,3 +43,4 @@ Debug
 *x64
 *dump
 *save
+*csv


@@ -11,7 +11,7 @@ xgboost-0.2x
 * Weighted samples instances
 * Initial version of pairwise rank
-xgboost-unity
+xgboost-0.3
 =====
 * Faster tree construction module
 - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```


@@ -1,32 +1,32 @@
 export CC = gcc
 export CXX = g++
 export LDFLAGS= -pthread -lm
-# note for R module
-# add include path to Rinternals.h here
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
 ifeq ($(no_omp),1)
-	export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -DDISABLE_OPENMP
+	CFLAGS += -DDISABLE_OPENMP
 else
-	export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
+	CFLAGS += -fopenmp
 endif
-# expose these flags to R CMD SHLIB
-export PKG_CPPFLAGS = $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_
 # specify tensor path
 BIN = xgboost
-OBJ =
+OBJ = updater.o gbm.o io.o
 SLIB = wrapper/libxgboostwrapper.so
-RLIB = wrapper/libxgboostR.so
-.PHONY: clean all R
-all: $(BIN) wrapper/libxgboostwrapper.so
-R: wrapper/libxgboostR.so
-xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
+.PHONY: clean all python Rpack
+all: $(BIN) $(OBJ) $(SLIB)
+python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
-wrapper/libxgboostR.so: wrapper/xgboost_wrapper.cpp wrapper/xgboost_R.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
+updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
+xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@@ -34,14 +34,31 @@ $(BIN) :
 $(SLIB) :
 	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
-$(RLIB) :
-	R CMD SHLIB -c -o $@ $(filter %.cpp %.o %.c, $^)
 $(OBJ) :
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
 install:
 	cp -f -r $(BIN) $(INSTALL_PATH)
+Rpack:
+	make clean
+	rm -rf xgboost xgboost*.tar.gz
+	cp -r R-package xgboost
+	rm -rf xgboost/inst/examples/*.buffer
+	rm -rf xgboost/inst/examples/*.model
+	rm -rf xgboost/inst/examples/dump*
+	rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
+	rm -rf xgboost/demo/*.model xgboost/demo/*.buffer
+	cp -r src xgboost/src/src
+	mkdir xgboost/src/wrapper
+	cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
+	cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
+	cp ./LICENSE xgboost
+	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
+	cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
+	R CMD build xgboost
+	rm -rf xgboost
+	R CMD check --as-cran xgboost*.tar.gz
 clean:
-	$(RM) $(OBJ) $(BIN) $(SLIB) $(RLIB) *~ */*~ */*/*~
+	$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~


@@ -1,12 +1,20 @@
 Package: xgboost
 Type: Package
-Title: R wrapper of xgboost
-Version: 0.3-0
+Title: eXtreme Gradient Boosting
+Version: 0.3-1
 Date: 2014-08-23
-Author: Tianqi Chen
-Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>
-Description: xgboost
-License: See LICENSE file
+Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
+Maintainer: Tong He <hetong007@gmail.com>
+Description: This package is an R wrapper of xgboost, which is short for eXtreme
+    Gradient Boosting. It is an efficient and scalable implementation of the
+    gradient boosting framework. The package includes an efficient linear model
+    solver and tree learning algorithms. The package can automatically do
+    parallel computation with OpenMP, and it can be more than 10 times faster
+    than existing gradient boosting packages such as gbm. It supports various
+    objective functions, including regression, classification and ranking. The
+    package is made to be extensible, so that users are also allowed to define
+    their own objectives easily.
+License: Apache License (== 2.0) | file LICENSE
 URL: https://github.com/tqchen/xgboost
 BugReports: https://github.com/tqchen/xgboost/issues
 Depends:
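A minimal usage sketch of the interface the new DESCRIPTION advertises (not part of the commit; assumes the package is installed, iris is only a stand-in dataset and the parameter values are illustrative):

```r
library(xgboost)
data(iris)
# train a small model; extra arguments such as nthread are forwarded to the parameter list
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]),
               nrounds = 2, objective = "reg:linear", nthread = 2)
pred <- predict(bst, as.matrix(iris[, 1:4]))
```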


@@ -1,10 +1,15 @@
-importClassesFrom("Matrix", dgCMatrix, dgeMatrix)
-export(xgboost)
+# Generated by roxygen2 (4.0.1): do not edit by hand
+export(getinfo)
+export(slice)
 export(xgb.DMatrix)
-export(xgb.getinfo)
-exportMethods(predict)
-export(xgb.train)
-export(xgb.save)
-export(xgb.load)
+export(xgb.DMatrix.save)
 export(xgb.dump)
+export(xgb.load)
+export(xgb.save)
+export(xgb.train)
+export(xgboost)
+exportMethods(predict)
+import(methods)
+importClassesFrom(Matrix,dgCMatrix)
+importClassesFrom(Matrix,dgeMatrix)


@@ -0,0 +1,38 @@
setClass('xgb.DMatrix')
#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' labels <- getinfo(dtrain, "label")
#' @rdname getinfo
#' @export
#'
getinfo <- function(object, ...){
UseMethod("getinfo")
}
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @param ... other parameters
#' @rdname getinfo
#' @method getinfo xgb.DMatrix
setMethod("getinfo", signature = "xgb.DMatrix",
definition = function(object, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name != "label" && name != "weight" && name != "base_margin") {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
return(ret)
})
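A small sketch of the getinfo() generic added above; the weights are random and purely illustrative:

```r
library(xgboost)
data(iris)
w <- runif(nrow(iris))  # made-up instance weights
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]),
                      label = as.numeric(iris[, 5]), weight = w)
labels  <- getinfo(dtrain, "label")
weights <- getinfo(dtrain, "weight")
stopifnot(length(labels) == nrow(iris),
          isTRUE(all.equal(weights, w, tolerance = 1e-4)))
```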


@@ -1,16 +1,37 @@
-#' @export
 setClass("xgb.Booster")
+#' Predict method for eXtreme Gradient Boosting model
+#'
+#' Predicted values based on xgboost model object.
+#'
+#' @param object Object of class "xgb.Booster"
+#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
+#'   \code{xgb.DMatrix}.
+#' @param outputmargin whether the prediction should be shown in the original
+#'   value of sum of functions, when outputmargin=TRUE, the prediction is
+#'   untransformed margin value. In logistic regression, outputmargin=T will
+#'   output value before logistic transformation.
+#' @param ntreelimit limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear.
+#'   set it to be value bigger than 0. It will use all trees by default.
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
 #' @export
-setMethod("predict",
-          signature = "xgb.Booster",
-          definition = function(object, newdata, outputmargin = FALSE)
-          {
+#'
+setMethod("predict", signature = "xgb.Booster",
+          definition = function(object, newdata, outputmargin = FALSE, ntreelimit = NULL) {
            if (class(newdata) != "xgb.DMatrix") {
-              newdata = xgb.DMatrix(newdata)
+              newdata <- xgb.DMatrix(newdata)
            }
-            ret <- .Call("XGBoosterPredict_R", object, newdata,
-                         as.integer(outputmargin), PACKAGE="xgboost")
+            if (is.null(ntreelimit)) {
+              ntreelimit <- 0
+            } else {
+              if (ntreelimit < 1){
+                stop("predict: ntreelimit must be equal to or greater than 1")
+              }
+            }
+            ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
            return(ret)
          })
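A hedged sketch of the two optional arguments documented above, margin output and ntreelimit (values are illustrative; the binary label is derived from iris only for the example):

```r
library(xgboost)
data(iris)
y   <- as.numeric(iris[, 5] == "setosa")            # toy binary label
bst <- xgboost(as.matrix(iris[, 1:4]), y, nrounds = 2,
               objective = "binary:logistic")
p_prob   <- predict(bst, as.matrix(iris[, 1:4]))                       # transformed prediction
p_margin <- predict(bst, as.matrix(iris[, 1:4]), outputmargin = TRUE)  # raw margin value
p_1tree  <- predict(bst, as.matrix(iris[, 1:4]), ntreelimit = 1)       # use only the first tree
```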


@@ -0,0 +1,33 @@
setClass('xgb.DMatrix')
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' dsub <- slice(dtrain, 1:3)
#' @rdname slice
#' @export
#'
slice <- function(object, ...){
UseMethod("slice")
}
#' @param object Object of class "xgb.DMatrix"
#' @param idxset an integer vector of indices of rows needed
#' @param ... other parameters
#' @rdname slice
#' @method slice xgb.DMatrix
setMethod("slice", signature = "xgb.DMatrix",
definition = function(object, idxset, ...) {
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
return(structure(ret, class = "xgb.DMatrix"))
})
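A sketch of using slice() to carve a validation subset out of an existing DMatrix (the index choice is arbitrary):

```r
library(xgboost)
data(iris)
dall   <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))
idx    <- sample(nrow(iris), 30)                    # 30 rows for validation
dvalid <- slice(dall, idx)
dtrain <- slice(dall, setdiff(seq_len(nrow(iris)), idx))
```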


@@ -1,30 +1,37 @@
+#' @importClassesFrom Matrix dgCMatrix dgeMatrix
+#' @import methods
 # depends on matrix
 .onLoad <- function(libname, pkgname) {
-  library.dynam("xgboost", pkgname, libname);
+  library.dynam("xgboost", pkgname, libname)
 }
 .onUnload <- function(libpath) {
-  library.dynam.unload("xgboost", libpath);
+  library.dynam.unload("xgboost", libpath)
 }
 # set information into dmatrix, this mutate dmatrix
 xgb.setinfo <- function(dmat, name, info) {
   if (class(dmat) != "xgb.DMatrix") {
-    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
+    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
   }
   if (name == "label") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "weight") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "base_margin") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "group") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   stop(paste("xgb.setinfo: unknown info name", name))
@@ -42,16 +49,16 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
     }
   }
   handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
-  .Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost")
   if (length(params) != 0) {
     for (i in 1:length(params)) {
       p <- params[i]
-      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost")
+      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p),
+            PACKAGE = "xgboost")
     }
   }
   if (!is.null(modelfile)) {
     if (typeof(modelfile) != "character") {
-      stop("xgb.Booster: modelfile must be character");
+      stop("xgb.Booster: modelfile must be character")
     }
     .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
   }
@@ -67,14 +74,13 @@ xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
   if (class(dmat) != "xgb.DMatrix") {
     stop("xgb.predict: second argument must be type xgb.DMatrix")
   }
-  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin), PACKAGE="xgboost")
+  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin),
+               PACKAGE = "xgboost")
   return(ret)
 }
-##--------------------------------------
-# the following are low level iteratively function, not needed
-# if you do not want to use them
-#---------------------------------------
+## ----the following are low level iteratively function, not needed if
+## you do not want to use them ---------------------------------------
 # iteratively update booster with dtrain
 xgb.iter.update <- function(booster, dtrain, iter) {
@@ -84,7 +90,8 @@ xgb.iter.update <- function(booster, dtrain, iter) {
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.iter.update: second argument must be type xgb.DMatrix")
   }
-  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost")
+  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
+        PACKAGE = "xgboost")
   return(TRUE)
 }
@@ -96,7 +103,8 @@ xgb.iter.boost <- function(booster, dtrain, gpair) {
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.iter.update: second argument must be type xgb.DMatrix")
   }
-  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost")
+  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess,
+        PACKAGE = "xgboost")
   return(TRUE)
 }
@@ -113,8 +121,8 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
       stop("xgb.eval: watch list can only contain xgb.DMatrix")
     }
   }
-  evnames <- list()
   if (length(watchlist) != 0) {
+    evnames <- list()
     for (i in 1:length(watchlist)) {
       w <- watchlist[i]
       if (length(names(w)) == 0) {
@@ -122,7 +130,10 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
       }
       evnames <- append(evnames, names(w))
     }
+    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
+                 evnames, PACKAGE = "xgboost")
+  } else {
+    msg <- ""
   }
-  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost")
   return(msg)
 }
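The iteration helpers above are internal (not exported), so a manual training loop has to reach them with `:::`; this sketch mirrors what xgb.train() does one round at a time and is only illustrative:

```r
library(xgboost)
data(iris)
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))
bst <- xgboost:::xgb.Booster(params = list(max_depth = 2, eta = 1, silent = 1),
                             cachelist = list(dtrain))
for (i in 1:2) {
  xgboost:::xgb.iter.update(bst, dtrain, i - 1)                        # one boosting round
  cat(xgboost:::xgb.iter.eval(bst, list(train = dtrain), i - 1), "\n")  # evaluation message
}
```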


@@ -1,21 +1,44 @@
-# constructing DMatrix
-xgb.DMatrix <- function(data, missing=0.0, ...) {
+#' Construct xgb.DMatrix object
+#'
+#' Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
+#'
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character
+#'   indicating the data file.
+#' @param info a list of information of the xgb.DMatrix object
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#'   value that represents missing values. Sometimes a data set uses 0 or another
+#'   extreme value to represent missing values.
+#' @param ... other information to pass to \code{info}.
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
+#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
+#' @export
+#'
+xgb.DMatrix <- function(data, info = list(), missing = 0, ...) {
   if (typeof(data) == "character") {
-    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
+    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
+                    PACKAGE = "xgboost")
   } else if (is.matrix(data)) {
-    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost")
+    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
+                    PACKAGE = "xgboost")
   } else if (class(data) == "dgCMatrix") {
-    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost")
+    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
+                    PACKAGE = "xgboost")
   } else {
-    stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
+    stop(paste("xgb.DMatrix: does not support to construct from ",
+               typeof(data)))
   }
   dmat <- structure(handle, class = "xgb.DMatrix")
-  info = list(...)
+  info <- append(info, list(...))
   if (length(info) == 0)
     return(dmat)
   for (i in 1:length(info)) {
-    p = info[i]
+    p <- info[i]
     xgb.setinfo(dmat, names(p), p[[1]])
   }
   return(dmat)
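A sketch of the three construction paths handled above, dense matrix, dgCMatrix and binary file (assumes the Matrix package, which the package already depends on; the missing sentinel is arbitrary):

```r
library(xgboost)
library(Matrix)
data(iris)
x <- as.matrix(iris[, 1:4])
y <- as.numeric(iris[, 5])
d_dense  <- xgb.DMatrix(x, label = y, missing = -999)   # dense input with a sentinel for missing
d_sparse <- xgb.DMatrix(as(x, "dgCMatrix"), label = y)  # sparse CSC input
```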


@@ -0,0 +1,27 @@
#' Save xgb.DMatrix object to binary file
#'
#' Save xgb.DMatrix object to binary file
#'
#' @param DMatrix the model object.
#' @param fname the name of the binary file.
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
#' @export
#'
xgb.DMatrix.save <- function(DMatrix, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character")
}
if (class(DMatrix) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE),
PACKAGE = "xgboost")
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
return(FALSE)
}
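A round-trip sketch for the binary DMatrix format introduced above (the file name is arbitrary):

```r
library(xgboost)
data(iris)
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))
xgb.DMatrix.save(dtrain, "iris.buffer")
dtrain2 <- xgb.DMatrix("iris.buffer")   # reload from the binary file
stopifnot(isTRUE(all.equal(getinfo(dtrain, "label"), getinfo(dtrain2, "label"))))
```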


@@ -1,11 +1,29 @@
-# dump model
-xgb.dump <- function(booster, fname, fmap = "") {
-  if (class(booster) != "xgb.Booster") {
+#' Save xgboost model to text file
+#'
+#' Save a xgboost model to text file. Could be parsed later.
+#'
+#' @param model the model object.
+#' @param fname the name of the binary file.
+#' @param fmap feature map file representing the type of feature.
+#'   Detailed description could be found at
+#'   \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
+#'   Run inst/examples/demo.R for the result and inst/examples/featmap.txt
+#'   for example Format.
+#'
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' xgb.dump(bst, 'iris.xgb.model.dump')
+#' @export
+#'
+xgb.dump <- function(model, fname, fmap = "") {
+  if (class(model) != "xgb.Booster") {
     stop("xgb.dump: first argument must be type xgb.Booster")
   }
   if (typeof(fname) != "character") {
     stop("xgb.dump: second argument must be type character")
   }
-  .Call("XGBoosterDumpModel_R", booster, fname, fmap, PACKAGE="xgboost")
+  .Call("XGBoosterDumpModel_R", model, fname, fmap, PACKAGE = "xgboost")
   return(TRUE)
 }


@@ -1,16 +0,0 @@
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name != "label" &&
name != "weight" &&
name != "base_margin" ) {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost")
return(ret)
}


@@ -1,5 +1,19 @@
+#' Load xgboost model from binary file
+#'
+#' Load xgboost model from the binary model file
+#'
+#' @param modelfile the name of the binary file.
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' xgb.save(bst, 'iris.xgb.model')
+#' bst <- xgb.load('iris.xgb.model')
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
+#' @export
+#'
 xgb.load <- function(modelfile) {
   if (is.null(modelfile))
-    stop('xgb.load: modelfile cannot be NULL')
+    stop("xgb.load: modelfile cannot be NULL")
   xgb.Booster(modelfile = modelfile)
 }


@@ -1,16 +1,27 @@
-# save model or DMatrix to file
-xgb.save <- function(handle, fname) {
+#' Save xgboost model to binary file
+#'
+#' Save xgboost model from xgboost or xgb.train
+#'
+#' @param model the model object.
+#' @param fname the name of the binary file.
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' xgb.save(bst, 'iris.xgb.model')
+#' bst <- xgb.load('iris.xgb.model')
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
+#' @export
+#'
+xgb.save <- function(model, fname) {
   if (typeof(fname) != "character") {
     stop("xgb.save: fname must be character")
   }
-  if (class(handle) == "xgb.Booster") {
-    .Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
+  if (class(model) == "xgb.Booster") {
+    .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
     return(TRUE)
   }
-  if (class(handle) == "xgb.DMatrix") {
-    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
-    return(TRUE)
-  }
-  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
+  stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
+       xgb.DMatrix object.")
   return(FALSE)
 }
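A save/load round trip for the binary model format; after reloading, predictions should match the original booster (file name is arbitrary):

```r
library(xgboost)
data(iris)
x <- as.matrix(iris[, 1:4])
y <- as.numeric(iris[, 5] == "setosa")
bst <- xgboost(x, y, nrounds = 2, objective = "binary:logistic")
xgb.save(bst, "xgb.model")
bst2 <- xgb.load("xgb.model")
stopifnot(isTRUE(all.equal(predict(bst, x), predict(bst2, x))))
```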


@@ -1,11 +1,78 @@
-# train a model using given parameters
-xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
+#' eXtreme Gradient Boosting Training
+#'
+#' The training function of xgboost
+#'
+#' @param params the list of parameters. Commonly used ones are:
+#' \itemize{
+#'   \item \code{objective} objective function, common ones are
+#'     \itemize{
+#'       \item \code{reg:linear} linear regression
+#'       \item \code{binary:logistic} logistic regression for classification
+#'     }
+#'   \item \code{eta} step size of each boosting step
+#'   \item \code{max_depth} maximum depth of the tree
+#'   \item \code{nthread} number of thread used in training, if not set, all threads are used
+#' }
+#'
+#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+#'   further details. See also inst/examples/demo.R for walkthrough example in R.
+#' @param dtrain takes an \code{xgb.DMatrix} as the input.
+#' @param nrounds the max number of iterations
+#' @param watchlist what information should be printed when \code{verbose=1} or
+#'   \code{verbose=2}. Watchlist is used to specify validation set monitoring
+#'   during training. For example user can specify
+#'   watchlist=list(validation1=mat1, validation2=mat2) to watch
+#'   the performance of each round's model on mat1 and mat2
+#'
+#' @param obj customized objective function. Returns gradient and second order
+#'   gradient with given prediction and dtrain,
+#' @param feval customized evaluation function. Returns
+#'   \code{list(metric='metric-name', value='metric-value')} with given
+#'   prediction and dtrain,
+#' @param ... other parameters to pass to \code{params}.
+#'
+#' @details
+#' This is the training function for xgboost.
+#'
+#' Parallelization is automatically enabled if OpenMP is present.
+#' Number of threads can also be manually specified via "nthread" parameter.
+#'
+#' This function only accepts an \code{xgb.DMatrix} object as the input.
+#' It supports advanced features such as watchlist, customized objective function,
+#' therefore it is more flexible than \code{\link{xgboost}}.
+#'
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' dtest <- dtrain
+#' watchlist <- list(eval = dtest, train = dtrain)
+#' param <- list(max_depth = 2, eta = 1, silent = 1)
+#' logregobj <- function(preds, dtrain) {
+#'   labels <- getinfo(dtrain, "label")
+#'   preds <- 1/(1 + exp(-preds))
+#'   grad <- preds - labels
+#'   hess <- preds * (1 - preds)
+#'   return(list(grad = grad, hess = hess))
+#' }
+#' evalerror <- function(preds, dtrain) {
+#'   labels <- getinfo(dtrain, "label")
+#'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+#'   return(list(metric = "error", value = err))
+#' }
+#' bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
+#' @export
+#'
+xgb.train <- function(params=list(), dtrain, nrounds, watchlist = list(),
+                      obj = NULL, feval = NULL, ...) {
   if (typeof(params) != "list") {
-    stop("xgb.train: first argument params must be list");
+    stop("xgb.train: first argument params must be list")
   }
   if (class(dtrain) != "xgb.DMatrix") {
-    stop("xgb.train: second argument dtrain must be xgb.DMatrix");
+    stop("xgb.train: second argument dtrain must be xgb.DMatrix")
   }
+  params = append(params, list(...))
   bst <- xgb.Booster(params, append(watchlist, dtrain))
   for (i in 1:nrounds) {
     if (is.null(obj)) {
@@ -18,17 +85,24 @@ xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, fe
     if (length(watchlist) != 0) {
       if (is.null(feval)) {
         msg <- xgb.iter.eval(bst, watchlist, i - 1)
-        cat(msg); cat("\n")
+        cat(msg)
+        cat("\n")
       } else {
-        cat("["); cat(i); cat("]");
+        cat("[")
+        cat(i)
+        cat("]")
         for (j in 1:length(watchlist)) {
           w <- watchlist[j]
           if (length(names(w)) == 0) {
             stop("xgb.eval: name tag must be presented for every elements in watchlist")
           }
           ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
-          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
-          cat(":"); cat(ret$value)
+          cat("\t")
+          cat(names(w))
+          cat("-")
+          cat(ret$metric)
+          cat(":")
+          cat(ret$value)
         }
         cat("\n")
       }
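A sketch of the watchlist mechanism documented above with the built-in logistic objective (the train/validation split is arbitrary):

```r
library(xgboost)
data(iris)
y <- as.numeric(iris[, 5] == "setosa")
dtrain <- xgb.DMatrix(as.matrix(iris[1:100, 1:4]), label = y[1:100])
dvalid <- xgb.DMatrix(as.matrix(iris[101:150, 1:4]), label = y[101:150])
param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
bst <- xgb.train(param, dtrain, nrounds = 2,
                 watchlist = list(train = dtrain, eval = dvalid))
```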


@@ -1,49 +1,71 @@
-# Main function for xgboost-package
-xgboost = function(x=NULL,y=NULL,DMatrix=NULL, file=NULL, validation=NULL,
-                   nrounds=10, obj=NULL, feval=NULL, margin=NULL, verbose = T, ...)
-{
-  if (!is.null(DMatrix))
-    dtrain = DMatrix
-  else
-  {
-    if (is.null(x) && is.null(y))
-    {
-      if (is.null(file))
-        stop('xgboost need input data, either R objects, local files or DMatrix object.')
-      dtrain = xgb.DMatrix(file)
-    }
-    else
-      dtrain = xgb.DMatrix(x, label=y)
-    if (!is.null(margin))
-    {
-      succ <- xgb.setinfo(dtrain, "base_margin", margin)
-      if (!succ)
-        warning('Attemp to use margin failed.')
-    }
+#' eXtreme Gradient Boosting (Tree) library
+#'
+#' A simple interface for xgboost in R
+#'
+#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
+#'   \code{xgb.DMatrix}.
+#' @param label the response variable. User should not set this field,
+#'   if data is local data file or \code{xgb.DMatrix}.
+#' @param params the list of parameters. Commonly used ones are:
+#' \itemize{
+#'   \item \code{objective} objective function, common ones are
+#'     \itemize{
+#'       \item \code{reg:linear} linear regression
+#'       \item \code{binary:logistic} logistic regression for classification
+#'     }
+#'   \item \code{eta} step size of each boosting step
+#'   \item \code{max_depth} maximum depth of the tree
+#'   \item \code{nthread} number of thread used in training, if not set, all threads are used
+#' }
+#'
+#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+#'   further details. See also inst/examples/demo.R for walkthrough example in R.
+#' @param nrounds the max number of iterations
+#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
+#'   information of performance. If 2, xgboost will print information of both
+#'   performance and construction progress information
+#' @param ... other parameters to pass to \code{params}.
+#'
+#' @details
+#' This is the modeling function for xgboost.
+#'
+#' Parallelization is automatically enabled if OpenMP is present.
+#' Number of threads can also be manually specified via "nthread" parameter
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
+#' @export
+#'
+xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
+                    verbose = 1, ...) {
+  inClass <- class(data)
+  if (inClass == "dgCMatrix" || inClass == "matrix") {
+    if (is.null(label))
+      stop("xgboost: need label when data is a matrix")
+    dtrain <- xgb.DMatrix(data, label = label)
+  } else {
+    if (!is.null(label))
+      warning("xgboost: label will be ignored.")
+    if (inClass == "character")
+      dtrain <- xgb.DMatrix(data) else if (inClass == "xgb.DMatrix")
+      dtrain <- data else stop("xgboost: Invalid input of data")
   }
-  params = list(...)
-  watchlist=list()
-  if (verbose)
-  {
-    if (!is.null(validation))
-    {
-      if (class(validation)!='xgb.DMatrix')
-        dtest = xgb.DMatrix(validation)
-      else
-        dtest = validation
-      watchlist = list(eval=dtest,train=dtrain)
-    }
-    else
-      watchlist = list(train=dtrain)
+  if (verbose > 1) {
+    silent <- 0
+  } else {
+    silent <- 1
   }
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval)
+  params <- append(params, list(silent = silent))
+  params <- append(params, list(...))
+  if (verbose > 0)
+    watchlist <- list(train = dtrain) else watchlist <- list()
+  bst <- xgb.train(params, dtrain, nrounds, watchlist)
   return(bst)
 }
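The two ways of supplying parameters in the rewritten xgboost() above, the params list and `...`, are interchangeable; a short sketch:

```r
library(xgboost)
data(iris)
x <- as.matrix(iris[, 1:4])
y <- as.numeric(iris[, 5] == "setosa")
bst1 <- xgboost(x, y, params = list(max_depth = 2, eta = 1, objective = "binary:logistic"),
                nrounds = 2)
bst2 <- xgboost(x, y, max_depth = 2, eta = 1, objective = "binary:logistic",
                nrounds = 2)
```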


@@ -1,10 +1,21 @@
-This is subfolder for experimental version of R package.
-Not yet ready.
-Installation:
+# R package for xgboost.
+## Installation
+For the up-to-date version (which is recommended), please install from GitHub. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
 ```r
 require(devtools)
 install_github('xgboost','tqchen',subdir='R-package')
 ```
+For the stable version on CRAN, please run
+```r
+install.packages('xgboost')
+```
+## Examples
+* Please visit the [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for a walkthrough example.
+* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for the Kaggle Higgs Challenge, including a [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.


@@ -1,133 +0,0 @@
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
content = readLines(fname)
nline = length(content)
label = numeric(nline)
mat = matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr = as.vector(strsplit(content[i], " ")[[1]])
label[i] = as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv = strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex = as.integer(kv[1]) + 1
fvalue = as.numeric(kv[2])
mat[i,findex] = fvalue
}
}
mat = as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
############################
# Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
############################
# Directly read in local file
dtrain = xgb.DMatrix('agaricus.txt.train')
class(dtrain)
# read file in R
csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data
# x as Sparse Matrix
class(x)
dtrain = xgb.DMatrix(x, label=y)
# x as dense matrix
dense.x = as.matrix(x)
dtrain = xgb.DMatrix(dense.x, label=y)
############################
# Test xgboost with local file, sparse matrix and dense matrix in R.
############################
# Test with DMatrix object
bst = xgboost(DMatrix=dtrain, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with local file
bst = xgboost(file='agaricus.txt.train', max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with Sparse Matrix
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with dense Matrix
bst = xgboost(x = dense.x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with validation set
bst = xgboost(file='agaricus.txt.train', validation='agaricus.txt.test',
max_depth=2, eta=1, silent=1, objective='binary:logistic')
############################
# Test predict
############################
# Prediction with DMatrix object
dtest = xgb.DMatrix('agaricus.txt.test')
pred = predict(bst, dtest)
# Prediction with local test file
pred = predict(bst, 'agaricus.txt.test')
# Prediction with Sparse Matrix
csc = read.libsvm("agaricus.txt.test", 126)
test.y = csc$label
test.x = csc$data
pred = predict(bst, test.x)
# Extrac label with xgb.getinfo
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
############################
# Save and load model to hard disk
############################
# save model to binary local file
xgb.save(bst, 'model.save')
# load binary model to R
bst = xgb.load('model.save')
pred = predict(bst, test.x)
# save model to text file
xgb.dump(bst, 'model.dump')
############################
# Customized objective and evaluation function
############################
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
preds = 1.0 / (1.0 + exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic',
obj=logregobj, feval=evalerror)


@@ -1,127 +0,0 @@
# load xgboost library
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i,findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))
# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)
###
# build dmatrix from sparseMatrix
###
print ('start running example of build DMatrix from R.sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# build dmatrix from dense matrix
###
print ('start running example of build DMatrix from R.Matrix')
mat = as.matrix(data)
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# advanced: cutomsized loss function
#
print("start running example to used cutomized objective function")
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with customized objective, we can also do step by step training
# simply look at xgboost.py"s implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from a initial base prediction
#
print ("start running example to start from a initial prediction")
# specify parameters via map, definition are same as c++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=True, will always give you margin values before logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )


@@ -1,103 +1,153 @@
 require(xgboost)
 require(methods)
-# helper function to read libsvm format
-# this is very badly written, load in dense, and convert to sparse
-# use this only for demo purpose
-# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
-read.libsvm = function(fname, maxcol) {
-  content = readLines(fname)
-  nline = length(content)
-  label = numeric(nline)
-  mat = matrix(0, nline, maxcol+1)
+# helper function to read libsvm format this is very badly written, load in dense, and convert to sparse
+# use this only for demo purpose adopted from
+# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
+read.libsvm <- function(fname, maxcol) {
+  content <- readLines(fname)
+  nline <- length(content)
+  label <- numeric(nline)
+  mat <- matrix(0, nline, maxcol + 1)
   for (i in 1:nline) {
-    arr = as.vector(strsplit(content[i], " ")[[1]])
-    label[i] = as.numeric(arr[[1]])
+    arr <- as.vector(strsplit(content[i], " ")[[1]])
+    label[i] <- as.numeric(arr[[1]])
     for (j in 2:length(arr)) {
-      kv = strsplit(arr[j], ":")[[1]]
+      kv <- strsplit(arr[j], ":")[[1]]
       # to avoid 0 index
-      findex = as.integer(kv[1]) + 1
-      fvalue = as.numeric(kv[2])
-      mat[i,findex] = fvalue
+      findex <- as.integer(kv[1]) + 1
+      fvalue <- as.numeric(kv[2])
+      mat[i, findex] <- fvalue
     }
   }
-  mat = as(mat, "sparseMatrix")
+  mat <- as(mat, "sparseMatrix")
   return(list(label = label, data = mat))
 }
-# Parameter setting
+############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
+# Directly read in local file
 dtrain <- xgb.DMatrix("agaricus.txt.train")
-dtest <- xgb.DMatrix("agaricus.txt.test")
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-watchlist = list("eval"=dtest,"train"=dtrain)
-###########################
-# Train from local file
-###########################
-# Training
-bst = xgboost(file='agaricus.txt.train',params=param,watchlist=watchlist)
-# Prediction
-pred = predict(bst, 'agaricus.txt.test')
-# Performance
-labels = xgb.getinfo(dtest, "label")
-err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
-print(paste("error=",err))
-###########################
-# Train from R object
-###########################
-csc = read.libsvm("agaricus.txt.train", 126)
-y = csc$label
-x = csc$data
+class(dtrain)
+# read file in R
+csc <- read.libsvm("agaricus.txt.train", 126)
+y <- csc$label
+x <- csc$data
 # x as Sparse Matrix
 class(x)
-# Training
-bst = xgboost(x,y,params=param,watchlist=watchlist)
-# Prediction
-pred = predict(bst, 'agaricus.txt.test')
-# Performance
-labels = xgb.getinfo(dtest, "label")
-err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
+dtrain <- xgb.DMatrix(x, label = y)
+# x as dense matrix
+dense.x <- as.matrix(x)
+dtrain <- xgb.DMatrix(dense.x, label = y)
+############################ Test xgboost with local file, sparse matrix and dense matrix in R.
+# Test with DMatrix object
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+# Verbose = 0,1,2
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 0)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 1)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 2)
+# Test with local file
+bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+# Test with Sparse Matrix
+bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+# Test with dense Matrix
+bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+############################ Test predict
+# Prediction with DMatrix object
+dtest <- xgb.DMatrix("agaricus.txt.test")
+pred <- predict(bst, dtest)
+# Prediction with local test file
+pred <- predict(bst, "agaricus.txt.test")
+# Prediction with Sparse Matrix
+csc <- read.libsvm("agaricus.txt.test", 126)
+test.y <- csc$label
+test.x <- csc$data
+pred <- predict(bst, test.x)
+# Extract label with getinfo
+labels <- getinfo(dtest, "label")
+err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
 print(paste("error=", err))
-# Training with dense matrix
-x = as.matrix(x)
-bst = xgboost(x,y,params=param,watchlist=watchlist)
-###########################
-# Train with customization
-###########################
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-logregobj = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  preds = 1.0 / (1.0 + exp(-preds))
-  grad = preds - labels
-  hess = preds * (1.0-preds)
+############################ Save and load model to hard disk
+# save model to binary local file
+xgb.save(bst, "xgboost.model")
+# load binary model to R
+bst <- xgb.load("xgboost.model")
+pred <- predict(bst, test.x)
+# save model to text file
+xgb.dump(bst, "dump.raw.txt")
+# save model to text file, with feature map
+xgb.dump(bst, "dump.nice.txt", "featmap.txt")
+# save a DMatrix object to hard disk
+xgb.DMatrix.save(dtrain, "dtrain.buffer")
+# load a DMatrix object to R
+dtrain <- xgb.DMatrix("dtrain.buffer")
+############################ More flexible training function xgb.train
+param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
+watchlist <- list(eval = dtest, train = dtrain)
+# training xgboost model
+bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)
+############################ customized loss function
+param <- list(max_depth = 2, eta = 1, silent = 1)
+# note: for customized objective function, we leave objective as default note: what we are getting is
+# margin value in prediction you must know what you are doing
+# user define objective function, given prediction, return gradient and second order gradient this is
+# loglikelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
   return(list(grad = grad, hess = hess))
 }
-# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
-# for example, we are doing logistic loss, the prediction is score before logistic transformation
-# the buildin evaluation error assumes input is after logistic transformation
-# Take this in mind when you use the customization, and maybe you need write customized evaluation function
-evalerror = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
+# user defined evaluation function, return a list(metric='metric-name', value='metric-value') NOTE: when
+# you do customized loss function, the default prediction value is margin this may make buildin
+# evalution metric not function properly for example, we are doing logistic loss, the prediction is
+# score before logistic transformation the buildin evaluation error assumes input is after logistic
+# transformation Take this in mind when you use the customization, and maybe you need write customized
+# evaluation function
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
   return(list(metric = "error", value = err))
 }
-bst = xgboost(x,y,params=param,watchlist=watchlist,obj=logregobj, feval=evalerror)
-###########################
-# Train with previous result
-###########################
-bst = xgboost(x,y,params=param,watchlist=watchlist)
-pred = predict(bst, 'agaricus.txt.train', outputmargin=TRUE)
-bst2 = xgboost(x,y,params=param,watchlist=watchlist,margin=pred)
+# training with customized objective, we can also do step by step training simply look at xgboost.py's
+# implementation of train
+bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)

R-package/man/getinfo.Rd

@@ -0,0 +1,28 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
\alias{getinfo,xgb.DMatrix-method}
\title{Get information of an xgb.DMatrix object}
\usage{
getinfo(object, ...)
\S4method{getinfo}{xgb.DMatrix}(object, name)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{name}{the name of the field to get}
\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
labels <- getinfo(dtrain, "label")
}


@@ -0,0 +1,32 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE,
ntreelimit = NULL)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear.
set it to be value bigger than 0. It will use all trees by default.}
}
\description{
Predicted values based on xgboost model object.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
}

R-package/man/slice.Rd

@@ -0,0 +1,30 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
\alias{slice,xgb.DMatrix-method}
\title{Get a new DMatrix containing the specified rows of
original xgb.DMatrix object}
\usage{
slice(object, ...)
\S4method{slice}{xgb.DMatrix}(object, idxset, ...)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{idxset}{an integer vector of indices of rows needed}
\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of
original xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
dsub <- slice(dtrain, 1:3)
}


@@ -0,0 +1,28 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object}
\usage{
xgb.DMatrix(data, info = list(), missing = 0, ...)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character
indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{...}{other information to pass to \code{info}.}
}
\description{
Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
}


@@ -0,0 +1,23 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
\usage{
xgb.DMatrix.save(DMatrix, fname)
}
\arguments{
\item{DMatrix}{the model object.}
\item{fname}{the name of the binary file.}
}
\description{
Save xgb.DMatrix object to binary file
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
}

R-package/man/xgb.dump.Rd

@@ -0,0 +1,27 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
\usage{
xgb.dump(model, fname, fmap = "")
}
\arguments{
\item{model}{the model object.}
\item{fname}{the name of the binary file.}
\item{fmap}{feature map file representing the type of feature.
Detailed description could be found at
\url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
Run inst/examples/demo.R for the result and inst/examples/featmap.txt
for example Format.}
}
\description{
Save a xgboost model to text file. Could be parsed later.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.dump(bst, 'iris.xgb.model.dump')
}

R-package/man/xgb.load.Rd

@@ -0,0 +1,21 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
\usage{
xgb.load(modelfile)
}
\arguments{
\item{modelfile}{the name of the binary file.}
}
\description{
Load xgboost model from the binary model file
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
}

R-package/man/xgb.save.Rd

@@ -0,0 +1,23 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
\usage{
xgb.save(model, fname)
}
\arguments{
\item{model}{the model object.}
\item{fname}{the name of the binary file.}
}
\description{
Save xgboost model from xgboost or xgb.train
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
}


@@ -0,0 +1,78 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), dtrain, nrounds, watchlist = list(),
obj = NULL, feval = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for walkthrough example in R.}
\item{dtrain}{takes an \code{xgb.DMatrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{watchlist}{what information should be printed when \code{verbose=1} or
\code{verbose=2}. Watchlist is used to specify validation set monitoring
during training. For example user can specify
watchlist=list(validation1=mat1, validation2=mat2) to watch
the performance of each round's model on mat1 and mat2}
\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain,}
\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain,}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
The training function of xgboost
}
\details{
This is the training function for xgboost.
Parallelization is automatically enabled if OpenMP is present.
Number of threads can also be manually specified via "nthread" parameter.
This function only accepts an \code{xgb.DMatrix} object as the input.
It supports advanced features such as watchlist, customized objective function,
therefore it is more flexible than \code{\link{xgboost}}.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
}
R-package/man/xgboost.Rd
@ -0,0 +1,52 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, params = list(), nrounds,
verbose = 1, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{label}{the response variable. The user should not set this field if \code{data} is a local data file or an \code{xgb.DMatrix}.}
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{nthread} number of threads used in training; if not set, all threads are used
}
See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for walkthrough example in R.}
\item{nrounds}{the max number of iterations}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
performance information. If 2, xgboost will print both performance and
tree construction progress information.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
A simple interface for xgboost in R
}
\details{
This is the modeling function for xgboost.
Parallelization is automatically enabled if OpenMP is present.
The number of threads can also be specified manually via the \code{nthread} parameter.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
}
R-package/src/Makevars
@ -1,28 +1,9 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
R-package/src/Makevars.win
@ -1,32 +1,7 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
R-package/src/xgboost_R.cpp
@ -2,25 +2,54 @@
#include <string>
#include <utility>
#include <cstring>
#include <cstdio>
#include "xgboost_R.h"
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
using namespace std;
using namespace xgboost;
extern "C" {
  void XGBoostAssert_R(int exp, const char *fmt, ...);
  void XGBoostCheck_R(int exp, const char *fmt, ...);
  int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...);
}
// implements error handling
namespace xgboost {
namespace utils {
extern "C" {
  void (*Printf)(const char *fmt, ...) = Rprintf;
  int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) = XGBoostSPrintf_R;
  void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
  void (*Error)(const char *fmt, ...) = error;
}
}  // namespace utils
namespace random {
void Seed(unsigned seed) {
  warning("parameter seed is ignored, please set random seed using set.seed");
}
double Uniform(void) {
  return unif_rand();
}
double Normal(void) {
  return norm_rand();
}
}  // namespace random
}  // namespace xgboost
// call before wrapper starts
inline void _WrapperBegin(void) {
  GetRNGstate();
}
// call after wrapper starts
inline void _WrapperEnd(void) {
  PutRNGstate();
}
extern "C" {
  void _DMatrixFinalizer(SEXP ext) {
    if (R_ExternalPtrAddr(ext) == NULL) return;
@ -28,14 +57,17 @@ extern "C" {
    R_ClearExternalPtr(ext);
  }
  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
    _WrapperBegin();
    void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixCreateFromMat_R(SEXP mat,
                                SEXP missing) {
    _WrapperBegin();
    SEXP dim = getAttrib(mat, R_DimSymbol);
    int nrow = INTEGER(dim)[0];
    int ncol = INTEGER(dim)[1];
@ -47,55 +79,64 @@ extern "C" {
        data[i * ncol +j] = din[i + nrow * j];
      }
    }
    void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                SEXP indices,
                                SEXP data) {
    _WrapperBegin();
    const int *p_indptr = INTEGER(indptr);
    const int *p_indices = INTEGER(indices);
    const double *p_data = REAL(data);
    int nindptr = length(indptr);
    int ndata = length(data);
    std::vector<bst_ulong> col_ptr_(nindptr);
    std::vector<unsigned> indices_(ndata);
    std::vector<float> data_(ndata);
    for (int i = 0; i < nindptr; ++i) {
      col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
    }
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < ndata; ++i) {
      indices_[i] = static_cast<unsigned>(p_indices[i]);
      data_[i] = static_cast<float>(p_data[i]);
    }
    void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
                                          BeginPtr(data_), nindptr, ndata);
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
    _WrapperBegin();
    int len = length(idxset);
    std::vector<int> idxvec(len);
    for (int i = 0; i < len; ++i) {
      idxvec[i] = INTEGER(idxset)[i] - 1;
    }
    void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
    SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
    _WrapperBegin();
    XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                        CHAR(asChar(fname)), asInteger(silent));
    _WrapperEnd();
  }
  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
    _WrapperBegin();
    int len = length(array);
    const char *name = CHAR(asChar(field));
    if (!strcmp("group", name)) {
@ -104,7 +145,8 @@ extern "C" {
      for (int i = 0; i < len; ++i) {
        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
      }
      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
      _WrapperEnd();
      return;
    }
    {
@ -115,10 +157,12 @@ extern "C" {
      }
      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
                            CHAR(asChar(field)),
                            BeginPtr(vec), len);
    }
    _WrapperEnd();
  }
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
    _WrapperBegin();
    bst_ulong olen;
    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                             CHAR(asChar(field)), &olen);
@ -127,6 +171,7 @@ extern "C" {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  // functions related to booster
@ -136,28 +181,35 @@ extern "C" {
    R_ClearExternalPtr(ext);
  }
  SEXP XGBoosterCreate_R(SEXP dmats) {
    _WrapperBegin();
    int len = length(dmats);
    std::vector<void*> dvec;
    for (int i = 0; i < len; ++i){
      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
    }
    void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
    _WrapperBegin();
    XGBoosterSetParam(R_ExternalPtrAddr(handle),
                      CHAR(asChar(name)),
                      CHAR(asChar(val)));
    _WrapperEnd();
  }
  void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
    _WrapperBegin();
    XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
                           asInteger(iter),
                           R_ExternalPtrAddr(dtrain));
    _WrapperEnd();
  }
  void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
    _WrapperBegin();
    utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
    int len = length(grad);
    std::vector<float> tgrad(len), thess(len);
@ -168,9 +220,11 @@ extern "C" {
    }
    XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
                          R_ExternalPtrAddr(dtrain),
                          BeginPtr(tgrad), BeginPtr(thess), len);
    _WrapperEnd();
  }
  SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
    _WrapperBegin();
    utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
    int len = length(dmats);
    std::vector<void*> vec_dmats;
@ -185,28 +239,37 @@ extern "C" {
    }
    return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                         asInteger(iter),
                                         BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
    _WrapperEnd();
  }
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
    _WrapperBegin();
    bst_ulong olen;
    const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                        R_ExternalPtrAddr(dmat),
                                        asInteger(output_margin),
                                        asInteger(ntree_limit),
                                        &olen);
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
    _WrapperBegin();
    XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
    _WrapperEnd();
  }
  void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
    _WrapperBegin();
    XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
    _WrapperEnd();
  }
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
    _WrapperBegin();
    bst_ulong olen;
    const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                          CHAR(asChar(fmap)),
@ -217,5 +280,6 @@ extern "C" {
      fprintf(fo, "%s", res[i]);
    }
    fclose(fo);
    _WrapperEnd();
  }
}
R-package/src/xgboost_R.h
@ -7,6 +7,7 @@
 */
extern "C" {
#include <Rinternals.h>
#include <R_ext/Random.h>
}
extern "C" {
@ -36,6 +37,13 @@ extern "C" {
  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                SEXP indices,
                                SEXP data);
  /*!
   * \brief create a new dmatrix from sliced content of existing matrix
   * \param handle instance of data matrix to be sliced
   * \param idxset index set
   * \return a sliced new matrix
   */
  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
  /*!
   * \brief load a data matrix into binary file
   * \param handle a instance of data matrix
@ -99,8 +107,9 @@ extern "C" {
   * \param handle handle
   * \param dmat data matrix
   * \param output_margin whether only output raw margin value
   * \param ntree_limit limit number of trees used in prediction
   */
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit);
  /*!
   * \brief load model from existing file
   * \param handle handle
@ -120,5 +129,5 @@ extern "C" {
   * \param fmap name to fmap can be empty string
   */
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
}
#endif  // XGBOOST_WRAPPER_R_H_
R-package/src/xgboost_assert.c
@ -0,0 +1,33 @@
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>
// implements error handling
void XGBoostAssert_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("AssertError:%s\n", buf);
}
}
void XGBoostCheck_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("%s\n", buf);
}
}
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) {
int ret;
va_list args;
va_start(args, fmt);
ret = vsnprintf(buf, size, fmt, args);
va_end(args);
return ret;
}
R-package/vignettes/xgboost.Rnw
@ -0,0 +1,212 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother
\begin{document}
%\SweaveOpts{concordance=TRUE}
<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@
%
<<prelim,echo=FALSE>>=
xgboost.version = '0.3-0'
@
%
\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}
\thispagestyle{empty}
\clearpage
\setcounter{page}{1}
\section{Introduction}
This is an introductory document for using the \verb@xgboost@ package in R.
\verb@xgboost@ is short for eXtreme Gradient Boosting. It is an efficient
and scalable implementation of the gradient boosting framework of \citet{friedman2001greedy}.
The package includes an efficient linear model solver and a tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is designed to be extensible, so that users can easily define their own objectives. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with OpenMP. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
\item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
\item{Data File: }{Local data files}
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
and linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective functions
and evaluation functions.}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets.}
\end{enumerate}
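As a quick sketch of the sparse-matrix input type listed above (not evaluated
here; the matrix, labels and parameter choices are made up purely for
illustration):
<<Sparse matrix input, eval=FALSE>>=
require(Matrix)
# build a small random matrix, zero out most entries, and store it as a dgCMatrix
x <- matrix(rnorm(100 * 10), 100, 10)
x[abs(x) < 1] <- 0
x.sparse <- Matrix(x, sparse = TRUE)
y <- rbinom(100, 1, 0.5)
# xgboost accepts the dgCMatrix directly
bst.sparse <- xgboost(x.sparse, y, params = list(objective = "binary:logistic"),
                      nrounds = 2)
@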
\section{Example with iris}
In this section, we will illustrate some common uses of \verb@xgboost@.
<<Training and prediction with iris>>=
library(xgboost)
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
nrounds = 5)
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, as.matrix(iris[,1:4]))
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction on the model.
Here we can save the model to a local binary file, and load it back when needed.
We cannot inspect the trees inside a binary file; however, we have another function that saves the
model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@
The output looks like
\begin{verbatim}
booster[0]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.147059
2:[f3<1.65] yes=3,no=4,missing=3
3:leaf=0.464151
4:leaf=0.722449
booster[1]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.103806
2:[f2<4.85] yes=3,no=4,missing=3
3:leaf=0.316341
4:leaf=0.510365
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from an initial prediction value and weighted training instances.
We can use the \verb@xgb.DMatrix@ function to construct such an object:
<<xgb.DMatrix>>=
iris.mat <- as.matrix(iris[,1:4])
iris.label <- as.numeric(iris[,5])
diris <- xgb.DMatrix(iris.mat, label = iris.label)
class(diris)
getinfo(diris,'label')
@
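Per-instance weights are one of the advanced features mentioned above that go
through \verb@xgb.DMatrix@. A minimal sketch (not evaluated; the weight values
are arbitrary illustration values), mirroring the usage in
\verb@demo/kaggle-higgs/higgs-train.R@:
<<Weighted training instances, eval=FALSE>>=
w <- rep(1, nrow(iris.mat))
w[1:50] <- 0.5
# weights are attached when the xgb.DMatrix is constructed
diris.weighted <- xgb.DMatrix(iris.mat, label = iris.label, weight = w)
bst.weighted <- xgb.train(list(max_depth = 2, eta = 1, silent = 1),
                          diris.weighted, nrounds = 2)
@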
We can also save the matrix to a binary file, and then load it back simply with
\verb@xgb.DMatrix@:
<<save model>>=
xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
diris = xgb.DMatrix('iris.xgb.DMatrix')
@
\section{Advanced Examples}
The function \verb@xgboost@ is a simple function with fewer parameters, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the documentation a bit more carefully.
\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, and it supports advanced features such as custom objective and evaluation functions.
<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- sqrt(mean((preds-labels)^2))
return(list(metric = "MSE", value = err))
}
dtest <- slice(diris,1:100)
watchlist <- list(eval = dtest, train = diris)
param <- list(max_depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
@
The gradient and the second order gradient are required as the output of a
customized objective function.
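For the logistic loss implemented by \verb@logregobj@ above, with label $y_i$ and
margin prediction $\hat{y}_i$, these two quantities are
\[
p_i = \frac{1}{1 + e^{-\hat{y}_i}}, \qquad
\frac{\partial \ell_i}{\partial \hat{y}_i} = p_i - y_i, \qquad
\frac{\partial^2 \ell_i}{\partial \hat{y}_i^2} = p_i\,(1 - p_i),
\]
which is exactly what the function returns as \verb@grad@ and \verb@hess@.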
We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
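As a minimal sketch (not evaluated; it assumes the \verb@diris@ matrix built
above), \verb@slice@ can carve out a hold-out fold by row index:
<<Manual fold with slice, eval=FALSE>>=
dvalid <- slice(diris, 101:150)   # hold out rows 101-150
dtrain.fold <- slice(diris, 1:100)
bst.fold <- xgb.train(list(max_depth = 2, eta = 1, silent = 1), dtrain.fold,
                      nrounds = 2,
                      watchlist = list(valid = dvalid, train = dtrain.fold))
@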
For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@ for further
details.
\section{The Higgs Boson competition}
We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.
Here are the instructions to make a submission
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on an Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}
We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.
\verb@xgboost@ can automatically do parallel computation. On a machine with an Intel
i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.
Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.
\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}
\end{document}
R-package/vignettes/xgboost.bib
@ -0,0 +1,20 @@
@article{friedman2001greedy,
title={Greedy function approximation: a gradient boosting machine},
author={Friedman, Jerome H},
journal={Annals of Statistics},
pages={1189--1232},
year={2001},
publisher={JSTOR}
}
@article{friedman2000additive,
title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
journal={The annals of statistics},
volume={28},
number={2},
pages={337--407},
year={2000},
publisher={Institute of Mathematical Statistics}
}
@ -1,26 +0,0 @@
This is a Fork of XGBoost from https://github.com/tqchen/xgboost
In the main repo you already find 2 windows projects for the porting of the executable and the python library.
Here you have:
1) a c# dll wrapper, meaning the passage from unmanaged to managed code, in https://github.com/giuliohome/xgboost/tree/master/windows/xgboost_sharp_wrapper
2) the c# Higgs Kaggle demo, instead of the python one (actually you will get a higher score with the c# version, due to some changes I've made) in https://github.com/giuliohome/xgboost/tree/master/windows/kaggle_higgs_demo
Start the demo from the root folder like this:
bin\x64\Debug\kaggle_higgs_demo.exe training_path.csv test_path.csv sharp_pred.csv NFoldCV NRound
NFoldCV: 0 => no cv , 5 = 5-fold-cv, 10 = 10-fold-cv :-)
3) 5 fold cv implementation in c# for the demo: you see inline cv ams while training (of course on a completely separate set)
In my latest commit I've added
4) parallel execution of n-fold cv, on top of dotnet multithreading
5) double inputted model training, stopping at a configured ams objective
demo/README.md
@ -0,0 +1,27 @@
XGBoost Examples
====
This folder contains all the example code using xgboost.
* Contributions of examples and benchmarks are more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
Features Walkthrough
====
This is a list of short code examples introducing different functionalities of xgboost and its wrappers.
* Basic walkthrough of wrappers [python](guide-python/basic_walkthrough.py)
* Customize loss function and evaluation metric [python](guide-python/custom_objective.py)
* Boosting from existing prediction [python](guide-python/boost_from_prediction.py)
* Predicting using first n trees [python](guide-python/predict_first_ntree.py)
* Generalized Linear Model [python](guide-python/generalized_linear_model.py)
* Cross validation [python](guide-python/cross_validation.py)
Basic Examples by Tasks
====
* [Binary classification](binary_classification)
* [Multiclass classification](multiclass_classification)
* [Regression](regression)
* [Learning to Rank](rank)
Benchmarks
====
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
demo/data/README.md
@ -0,0 +1,2 @@
This folder contains processed example dataset used by the demos.
Copyright of the dataset belongs to the original copyright holder
demo/guide-R/README.md
@ -0,0 +1,3 @@
XGBoost R Feature Walkthrough
====
To be finished
demo/guide-R/runall.sh
@ -0,0 +1,5 @@
#!/bin/bash
# todo
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R
demo/guide-python/README.md
@ -0,0 +1,8 @@
XGBoost Python Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.py)
* [Customize loss function and evaluation metric](custom_objective.py)
* [Boosting from existing prediction](boost_from_prediction.py)
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
demo/guide-python/basic_walkthrough.py
@ -0,0 +1,76 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost, you may need to change the following line
# alternatively, you can add the path to PYTHONPATH environment variable
sys.path.append('../../wrapper')
import xgboost as xgb
### simple example
# load file from text file, also binary buffer generated by xgboost
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# specify parameters via map, definition are same as c++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# specify validations set to watch performance
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.nice.txt','../data/featmap.txt')
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
labels = []
row = []; col = []; dat = []
i = 0
for l in open('../data/agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
for it in arr[1:]:
k,v = it.split(':')
row.append(i); col.append(int(k)); dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr, label = labels )
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )
print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
# we can also construct from csc matrix
csc = scipy.sparse.csc_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )
print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation
# then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix(npymat, label = labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )
demo/guide-python/boost_from_prediction.py
@ -0,0 +1,26 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
watchlist = [(dtest,'eval'), (dtrain,'train')]
###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map, definition are same as c++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=True, will always give you margin values before logistic transformation
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)
print ('this is result of running from initial prediction')
bst = xgb.train( param, dtrain, 1, watchlist )
demo/guide-python/cross_validation.py
@ -0,0 +1,63 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0)
print ('running cross validation, disable standard deviation display')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0, show_stdv = False)
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
label = dtrain.get_label()
ratio = float(np.sum(label == 0)) / np.sum(label==1)
param['scale_pos_weight'] = ratio
return (dtrain, dtest, param)
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed = 0, fpreproc = fpreproc)
###
# you can also do cross validation with customized loss function
# See custom_objective.py
##
print ('running cross validation, with customized loss function')
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
def evalerror(preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
obj = logregobj, feval=evalerror)
demo/guide-python/custom_objective.py
@ -0,0 +1,44 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
###
# advanced: customized loss function
#
print ('start running example of using a customized objective function')
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'max_depth':2, 'eta':1, 'silent':1 }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you use a customized loss function, the default prediction value is the margin
# this may make the built-in evaluation metric not function properly
# for example, we are doing logistic loss here, so the prediction is the score before logistic transformation
# the built-in evaluation error assumes the input is after the logistic transformation
# keep this in mind when you use the customization; you may need to write a customized evaluation function
def evalerror(preds, dtrain):
labels = dtrain.get_label()
# return a pair metric_name, result
# since preds are margin(before logistic transformation, cutoff at 0)
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
demo/guide-python/generalized_linear_model.py
@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of trees as our booster
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1 }
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun);
# parallelization could affect convergence in certain cases
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 1
##
# the rest of settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
demo/guide-python/predict_first_ntree.py
@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)
print ('start testing prediction from first n trees')
### predict using first 1 tree
label = dtest.get_label()
ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
ypred2 = bst.predict(dtest)
print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))
demo/guide-python/runall.sh
@ -0,0 +1,7 @@
#!/bin/bash
python basic_walkthrough.py
python custom_objective.py
python boost_from_prediction.py
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer
demo/kaggle-higgs/README.md
@ -10,6 +10,7 @@ This script will achieve about 3.600 AMS score in public leadboard. To get start
cd ../..
make
```
2. Put training.csv test.csv on folder './data' (you can create a symbolic link)
3. Run ./run.sh
@ -21,5 +22,5 @@ speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
Using R module
=====
* Alternatively, you can run using R, higgs-train.R and higgs-pred.R.
demo/kaggle-higgs/higgs-cv.py
@ -0,0 +1,39 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
label = train[:,32]
data = train[:,1:31]
weight = train[:,31]
dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4}
num_round = 120
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
label = dtrain.get_label()
ratio = float(np.sum(label == 0)) / np.sum(label==1)
param['scale_pos_weight'] = ratio
wtrain = dtrain.get_weight()
wtest = dtest.get_weight()
sum_weight = sum(wtrain) + sum(wtest)
wtrain *= sum_weight / sum(wtrain)
wtest *= sum_weight / sum(wtest)
dtrain.set_weight(wtrain)
dtest.set_weight(wtest)
return (dtrain, dtest, param)
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'ams@0.15', 'auc'}, seed = 0, fpreproc = fpreproc)
demo/kaggle-higgs/higgs-pred.R
@ -1,5 +1,6 @@
# install xgboost package, see R-package in root folder
require(xgboost)
require(methods)
modelfile <- "higgs.model"
outfile <- "higgs.pred.csv"
@ -8,8 +9,8 @@ data <- as.matrix(dtest[2:31])
idx <- dtest[[1]]
xgmat <- xgb.DMatrix(data, missing = -999.0)
bst <- xgb.load(modelfile=modelfile)
ypred <- predict(bst, xgmat)
rorder <- rank(ypred, ties.method="first")
demo/kaggle-higgs/higgs-train.R
@ -1,5 +1,7 @@
# install xgboost package, see R-package in root folder
require(xgboost)
require(methods)
testsize <- 550000
dtrain <- read.csv("data/training.csv", header=TRUE)
@ -12,7 +14,7 @@ sumwpos <- sum(weight * (label==1.0))
sumwneg <- sum(weight * (label==0.0))
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
param <- list("objective" = "binary:logitraw",
              "scale_pos_weight" = sumwneg / sumwpos,
              "bst:eta" = 0.1,
demo/kaggle-higgs/speedtest.R
@ -0,0 +1,71 @@
# install xgboost package, see R-package in root folder
require(xgboost)
require(gbm)
require(methods)
testsize <- 550000
dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
# gbm.time = system.time({
# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
# interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
# verbose = TRUE)
# })
# print(gbm.time)
# Test result: 761.48 secs
dtrain[33] <- dtrain[33] == "s"
label <- as.numeric(dtrain[[33]])
data <- as.matrix(dtrain[2:31])
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
sumwpos <- sum(weight * (label==1.0))
sumwneg <- sum(weight * (label==0.0))
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
xgboost.time = list()
threads = c(1,2,4,8,16)
for (i in 1:length(threads)){
thread = threads[i]
xgboost.time[[i]] = system.time({
xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
param <- list("objective" = "binary:logitraw",
"scale_pos_weight" = sumwneg / sumwpos,
"bst:eta" = 0.1,
"bst:max_depth" = 6,
"eval_metric" = "auc",
"eval_metric" = "ams@0.15",
"silent" = 1,
"nthread" = thread)
watchlist <- list("train" = xgmat)
nround = 120
print ("loading data end, start to boost trees")
bst = xgb.train(param, xgmat, nround, watchlist );
# save out model
xgb.save(bst, "higgs.model")
print ('finish training')
})
}
xgboost.time
# [[1]]
# user system elapsed
# 444.98 1.96 450.22
#
# [[2]]
# user system elapsed
# 188.15 0.82 102.41
#
# [[3]]
# user system elapsed
# 143.29 0.79 44.18
#
# [[4]]
# user system elapsed
# 176.60 1.45 34.04
#
# [[5]]
# user system elapsed
# 180.15 2.85 35.26
src/README.md
@ -13,10 +13,10 @@ Project Logical Layout
File Naming Convention
=======
* .h files are data structures and interfaces, which are needed to use functions in that layer.
* -inl.hpp files are implementations of the interface, like the cpp files in most projects.
  - You only need to understand the interface file to understand the usage of that layer.
* In each folder, there can be a .cpp file that compiles the module of that layer.
How to Hack the Code
======
src/data.h
@ -7,16 +7,8 @@
*/ */
#include <cstdio>
#include <vector>
#include "utils/utils.h"
#include "utils/iterator.h"
namespace xgboost {
/*!
@ -70,12 +62,12 @@ struct SparseBatch {
  /*! \brief an entry of sparse vector */
  struct Entry {
    /*! \brief feature index */
    bst_uint index;
    /*! \brief feature value */
    bst_float fvalue;
    // default constructor
    Entry(void) {}
    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
    /*! \brief reversely compare feature values */
    inline static bool CmpValue(const Entry &a, const Entry &b) {
      return a.fvalue < b.fvalue;
@ -86,7 +78,7 @@ struct SparseBatch {
    /*! \brief pointer to the elements*/
    const Entry *data;
    /*! \brief length of the instance */
    bst_uint length;
    /*! \brief constructor */
    Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
    /*! \brief get i-th pair in the sparse vector*/
@ -96,298 +88,72 @@ struct SparseBatch {
  };
  /*! \brief batch size */
  size_t size;
};
/*! \brief read-only row batch, used to access row continuously */
struct RowBatch : public SparseBatch {
  /*! \brief the offset of rowid of this batch */
  size_t base_rowid;
  /*! \brief array[size+1], row pointer of each of the elements */
  const size_t *ind_ptr;
  /*! \brief array[ind_ptr.back()], content of the sparse element */
  const Entry *data_ptr;
  /*! \brief get i-th row from the batch */
  inline Inst operator[](size_t i) const {
    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
  }
};
/*!
 * \brief read-only column batch, used to access columns,
 * the columns are not required to be continuous
 */
struct ColBatch : public SparseBatch {
  /*! \brief column index of each columns in the data */
  const bst_uint *col_index;
  /*! \brief pointer to the column data */
  const Inst *col_data;
  /*! \brief get i-th row from the batch */
  inline Inst operator[](size_t i) const {
    return col_data[i];
  }
};
/**
 * \brief interface of feature matrix, needed for tree construction
 * this interface defines two way to access features,
 * row access is defined by iterator of RowBatch
 * col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
 */
class IFMatrix {
 public:
  // the interface only need to ganrantee row iter
  // column iter is active, when ColIterator is called, row_iter can be disabled
  /*! \brief get the row iterator associated with FMatrix */
  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
  /*!\brief get column iterator */
  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
  /*!
   * \brief get the column iterator associated with FMatrix with subset of column features
   * \param fset is the list of column index set that must be contained in the returning Column iterator
   * \return the column iterator, initialized so that it reads the elements in fset
   */
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
  /*!
   * \brief check if column access is supported, if not, initialize column access
   * \param subsample subsample ratio when generating column access
   */
  virtual void InitColAccess(float subsample) = 0;
  // the following are column meta data, should be able to answer them fast
  /*! \return whether column access is enabled */
  virtual bool HaveColAccess(void) const = 0;
  /*! \return number of columns in the FMatrix */
  virtual size_t NumCol(void) const = 0;
  /*! \brief get number of non-missing entries in column */
  virtual size_t GetColSize(size_t cidx) const = 0;
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const = 0;
  /*! \brief reference of buffered rowset */
  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
  // virtual destructor
  virtual ~IFMatrix(void){}
};
/*!
* \brief sparse matrix that support column access, CSC
*/
class FMatrixS : public FMatrixInterface<FMatrixS>{
public:
typedef SparseBatch::Entry Entry;
/*! \brief row iterator */
struct ColIter{
const Entry *dptr_, *end_;
ColIter(const Entry* begin, const Entry* end)
:dptr_(begin), end_(end) {}
inline bool Next(void) {
if (dptr_ == end_) {
return false;
} else {
++dptr_; return true;
}
}
inline bst_uint rindex(void) const {
return dptr_->findex;
}
inline bst_float fvalue(void) const {
return dptr_->fvalue;
}
};
/*! \brief reverse column iterator */
struct ColBackIter : public ColIter {
ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
// shadows ColIter::Next
inline bool Next(void) {
if (dptr_ == end_) {
return false;
} else {
--dptr_; return true;
}
}
};
/*! \brief constructor */
FMatrixS(void) {
iter_ = NULL;
}
// destructor
~FMatrixS(void) {
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
inline bool HaveColAccess(void) const {
return col_ptr_.size() != 0;
}
/*! \brief get number of colmuns */
inline size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1;
}
/*! \brief get number of buffered rows */
inline const std::vector<bst_uint> buffered_rowset(void) const {
return buffered_rowset_;
}
/*! \brief get col sorted iterator */
inline ColIter GetSortedCol(size_t cidx) const {
utils::Assert(cidx < this->NumCol(), "col id exceed bound");
return ColIter(&col_data_[0] + col_ptr_[cidx] - 1,
&col_data_[0] + col_ptr_[cidx + 1] - 1);
}
/*!
* \brief get reversed col iterator,
* this function will be deprecated at some point
*/
inline ColBackIter GetReverseSortedCol(size_t cidx) const {
utils::Assert(cidx < this->NumCol(), "col id exceed bound");
return ColBackIter(&col_data_[0] + col_ptr_[cidx + 1],
&col_data_[0] + col_ptr_[cidx]);
}
/*! \brief get col size */
inline size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx];
}
/*! \brief get column density */
inline float GetColDensity(size_t cidx) const {
  size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
  return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
inline void InitColAccess(float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
this->InitColData(pkeep);
}
/*!
* \brief get the row iterator associated with FMatrix
* this function is not thread-safe; it returns the iterator stored in FMatrixS
*/
inline utils::IIterator<SparseBatch>* RowIterator(void) const {
iter_->BeforeFirst();
return iter_;
}
/*! \brief set iterator */
inline void set_iter(utils::IIterator<SparseBatch> *iter) {
this->iter_ = iter;
}
/*!
* \brief save column access data into stream
* \param fo output stream to save to
*/
inline void SaveColAccess(utils::IStream &fo) const {
fo.Write(buffered_rowset_);
if (buffered_rowset_.size() != 0) {
SaveBinary(fo, col_ptr_, col_data_);
}
}
/*!
* \brief load column access data from stream
* \param fi input stream to load from
*/
inline void LoadColAccess(utils::IStream &fi) {
utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
if (buffered_rowset_.size() != 0) {
LoadBinary(fi, &col_ptr_, &col_data_);
}
}
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<SparseBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<SparseBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
"invalid input file format");
}
}
protected:
/*!
* \brief initialize column data
* \param pkeep probability to keep a row
*/
inline void InitColData(float pkeep) {
buffered_rowset_.clear();
// note: this part of the code is serial; TODO: parallelize this transformation
utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
builder.InitBudget(0);
// start working
iter_->BeforeFirst();
while (iter_->Next()) {
const SparseBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
SparseBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].findex);
}
}
}
}
builder.InitStorage();
iter_->BeforeFirst();
size_t ktop = 0;
while (iter_->Next()) {
const SparseBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (ktop < buffered_rowset_.size() &&
buffered_rowset_[ktop] == batch.base_rowid+i) {
++ktop;
SparseBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.PushElem(inst[j].findex,
Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
}
}
}
}
// sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ncol; ++i) {
std::sort(&col_data_[0] + col_ptr_[i],
&col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
}
}
private:
// --- data structure used to support InitColAccess --
utils::IIterator<SparseBatch> *iter_;
/*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
/*! \brief column data in CSC format */
std::vector<SparseBatch::Entry> col_data_;
}; };
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_DATA_H #endif // XGBOOST_DATA_H
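
The column-access interface above boils down to two per-column quantities: the entry count (GetColSize) and the density used to decide how a column should be enumerated. A minimal standalone sketch of both, with a made-up col_ptr and row count (not the xgboost sources themselves):

```cpp
// Standalone sketch of GetColSize / GetColDensity for a toy CSC layout.
#include <cstdio>
#include <vector>

int main() {
  // col_ptr[c] .. col_ptr[c+1] delimit the non-missing entries of column c (CSC)
  std::vector<size_t> col_ptr = {0, 3, 3, 5};
  size_t num_buffered_rows = 4;  // rows kept by InitColAccess
  for (size_t c = 0; c + 1 < col_ptr.size(); ++c) {
    size_t col_size = col_ptr[c + 1] - col_ptr[c];                        // GetColSize(c)
    size_t nmiss = num_buffered_rows - col_size;
    float density = 1.0f - static_cast<float>(nmiss) / num_buffered_rows; // GetColDensity(c)
    std::printf("col %zu: size=%zu density=%.2f\n", c, col_size, density);
  }
  return 0;
}
```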


@ -18,13 +18,13 @@ namespace gbm {
* \brief gradient boosted linear model * \brief gradient boosted linear model
* \tparam FMatrix the data type the updater takes * \tparam FMatrix the data type the updater takes
*/ */
template<typename FMatrix> class GBLinear : public IGradBooster {
class GBLinear : public IGradBooster<FMatrix> {
public: public:
virtual ~GBLinear(void) { virtual ~GBLinear(void) {
} }
// set model parameters // set model parameters
virtual void SetParam(const char *name, const char *val) { virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strncmp(name, "bst:", 4)) { if (!strncmp(name, "bst:", 4)) {
param.SetParam(name + 4, val); param.SetParam(name + 4, val);
} }
@ -41,13 +41,12 @@ class GBLinear : public IGradBooster<FMatrix> {
virtual void InitModel(void) { virtual void InitModel(void) {
model.InitModel(); model.InitModel();
} }
virtual void DoBoost(const FMatrix &fmat, virtual void DoBoost(IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) { std::vector<bst_gpair> *in_gpair) {
this->InitFeatIndex(fmat);
std::vector<bst_gpair> &gpair = *in_gpair; std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model.param.num_output_group; const int ngroup = model.param.num_output_group;
const std::vector<bst_uint> &rowset = fmat.buffered_rowset(); const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// for all the output group // for all the output group
for (int gid = 0; gid < ngroup; ++gid) { for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0; double sum_grad = 0.0, sum_hess = 0.0;
@ -72,16 +71,20 @@ class GBLinear : public IGradBooster<FMatrix> {
} }
} }
} }
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
// number of features // number of features
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size()); const ColBatch &batch = iter->Value();
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) { for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = feat_index[i]; const bst_uint fid = batch.col_index[i];
ColBatch::Inst col = batch[i];
for (int gid = 0; gid < ngroup; ++gid) { for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0; double sum_grad = 0.0, sum_hess = 0.0;
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) { for (bst_uint j = 0; j < col.length; ++j) {
const float v = it.fvalue(); const float v = col[j].fvalue;
bst_gpair &p = gpair[it.rindex() * ngroup + gid]; bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.hess < 0.0f) continue; if (p.hess < 0.0f) continue;
sum_grad += p.grad * v; sum_grad += p.grad * v;
sum_hess += p.hess * v * v; sum_hess += p.hess * v * v;
@ -90,27 +93,30 @@ class GBLinear : public IGradBooster<FMatrix> {
bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w)); bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
w += dw; w += dw;
// update grad value // update grad value
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) { for (bst_uint j = 0; j < col.length; ++j) {
bst_gpair &p = gpair[it.rindex() * ngroup + gid]; bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.hess < 0.0f) continue; if (p.hess < 0.0f) continue;
p.grad += p.hess * it.fvalue() * dw; p.grad += p.hess * col[j].fvalue * dw;
}
} }
} }
} }
} }
virtual void Predict(const FMatrix &fmat, virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<float> *out_preds) { std::vector<float> *out_preds,
unsigned ntree_limit = 0) {
utils::Check(ntree_limit == 0,
"GBLinear::Predict ntrees is only valid for gbtree predictor");
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
preds.resize(0); preds.resize(0);
// start collecting the prediction // start collecting the prediction
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
const int ngroup = model.param.num_output_group; const int ngroup = model.param.num_output_group;
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Assert(batch.base_rowid * ngroup == preds.size(), utils::Assert(batch.base_rowid * ngroup == preds.size(),
"base_rowid is not set correctly"); "base_rowid is not set correctly");
// output convention: nrow * k, where nrow is number of rows // output convention: nrow * k, where nrow is number of rows
@ -134,23 +140,11 @@ class GBLinear : public IGradBooster<FMatrix> {
} }
protected: protected:
inline void InitFeatIndex(const FMatrix &fmat) { inline void Pred(const RowBatch::Inst &inst, float *preds) {
if (feat_index.size() != 0) return;
// initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol());
feat_index.reserve(ncol);
for (unsigned i = 0; i < ncol; ++i) {
if (fmat.GetColSize(i) != 0) {
feat_index.push_back(i);
}
}
random::Shuffle(feat_index);
}
inline void Pred(const SparseBatch::Inst &inst, float *preds) {
for (int gid = 0; gid < model.param.num_output_group; ++gid) { for (int gid = 0; gid < model.param.num_output_group; ++gid) {
float psum = model.bias()[gid]; float psum = model.bias()[gid];
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
psum += inst[i].fvalue * model[inst[i].findex][gid]; psum += inst[i].fvalue * model[inst[i].index][gid];
} }
preds[gid] = psum; preds[gid] = psum;
} }
@ -173,6 +167,7 @@ class GBLinear : public IGradBooster<FMatrix> {
learning_rate = 1.0f; learning_rate = 1.0f;
} }
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
// sync-names // sync-names
if (!strcmp("eta", name)) learning_rate = static_cast<float>(atof(val)); if (!strcmp("eta", name)) learning_rate = static_cast<float>(atof(val));
if (!strcmp("lambda", name)) reg_lambda = static_cast<float>(atof(val)); if (!strcmp("lambda", name)) reg_lambda = static_cast<float>(atof(val));
@ -214,9 +209,10 @@ class GBLinear : public IGradBooster<FMatrix> {
Param(void) { Param(void) {
num_feature = 0; num_feature = 0;
num_output_group = 1; num_output_group = 1;
memset(reserved, 0, sizeof(reserved)); std::memset(reserved, 0, sizeof(reserved));
} }
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp(name, "bst:num_feature")) num_feature = atoi(val); if (!strcmp(name, "bst:num_feature")) num_feature = atoi(val);
if (!strcmp(name, "num_output_group")) num_output_group = atoi(val); if (!strcmp(name, "num_output_group")) num_output_group = atoi(val);
} }
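
The DoBoost loop above is a coordinate-descent step per feature column: accumulate weighted gradient and hessian sums over the column, take a regularized step on the weight, then fold the weight change back into the buffered gradients. A standalone sketch of one such step, assuming a plain ridge-style delta -(G + lambda*w)/(H + lambda) in place of param.CalcDelta (the real parameter struct also handles other regularization):

```cpp
// Standalone sketch of one gblinear coordinate update on a single column.
#include <cstdio>
#include <vector>

int main() {
  // one feature column: per-row feature values, gradients and hessians
  std::vector<float> fvalue = {1.0f, 0.5f, 2.0f};
  std::vector<float> grad   = {0.3f, -0.2f, 0.1f};
  std::vector<float> hess   = {1.0f, 1.0f, 1.0f};
  float w = 0.0f, eta = 0.5f, lambda = 1.0f;

  double G = 0.0, H = 0.0;
  for (size_t j = 0; j < fvalue.size(); ++j) {
    G += grad[j] * fvalue[j];                 // sum_grad
    H += hess[j] * fvalue[j] * fvalue[j];     // sum_hess
  }
  float dw = static_cast<float>(eta * (-(G + lambda * w) / (H + lambda)));
  w += dw;
  // propagate the weight change into the buffered gradients,
  // exactly as the loop after the weight update does above
  for (size_t j = 0; j < fvalue.size(); ++j) {
    grad[j] += hess[j] * fvalue[j] * dw;
  }
  std::printf("dw=%f new w=%f\n", dw, w);
  return 0;
}
```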

src/gbm/gbm.cpp (new file, 19 lines)

@ -0,0 +1,19 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
#include "./gblinear-inl.hpp"
namespace xgboost {
namespace gbm {
IGradBooster* CreateGradBooster(const char *name) {
using namespace std;
if (!strcmp("gbtree", name)) return new GBTree();
if (!strcmp("gblinear", name)) return new GBLinear();
utils::Error("unknown booster type: %s", name);
return NULL;
}
} // namespace gbm
} // namespace xgboost
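
With the booster de-templated, construction now goes through this factory. A hedged usage sketch: all names come from the diff above, while the include path and the omitted training setup (IFMatrix, BoosterInfo, gradient pairs) are assumptions left out here.

```cpp
// Hedged usage sketch of the new gradient-booster factory.
#include "gbm.h"  // declares xgboost::gbm::IGradBooster and CreateGradBooster

void gbm_factory_example() {
  xgboost::gbm::IGradBooster *gbm = xgboost::gbm::CreateGradBooster("gblinear");
  gbm->SetParam("num_pbuffer", "0");  // prediction buffer size, per the interface doc
  gbm->InitModel();
  // ... repeatedly call DoBoost(p_fmat, info, &gpair) and Predict(...) ...
  delete gbm;  // IGradBooster has a virtual destructor
}
```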


@ -7,6 +7,7 @@
*/ */
#include <vector> #include <vector>
#include "../data.h" #include "../data.h"
#include "../utils/io.h"
#include "../utils/fmap.h" #include "../utils/fmap.h"
namespace xgboost { namespace xgboost {
@ -14,9 +15,7 @@ namespace xgboost {
namespace gbm { namespace gbm {
/*! /*!
* \brief interface of gradient boosting model * \brief interface of gradient boosting model
* \tparam FMatrix the data type the updater takes
*/ */
template<typename FMatrix>
class IGradBooster { class IGradBooster {
public: public:
/*! /*!
@ -41,28 +40,31 @@ class IGradBooster {
virtual void InitModel(void) = 0; virtual void InitModel(void) = 0;
/*! /*!
* \brief perform update to the model (boosting) * \brief perform update to the model (boosting)
* \param fmat feature matrix that provide access to features * \param p_fmat feature matrix that provide access to features
* \param info meta information about training * \param info meta information about training
* \param in_gpair address of the gradient pair statistics of the data * \param in_gpair address of the gradient pair statistics of the data
* the booster may change content of gpair * the booster may change content of gpair
*/ */
virtual void DoBoost(const FMatrix &fmat, virtual void DoBoost(IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) = 0; std::vector<bst_gpair> *in_gpair) = 0;
/*! /*!
* \brief generate predictions for given feature matrix * \brief generate predictions for given feature matrix
* \param fmat feature matrix * \param p_fmat feature matrix
* \param buffer_offset buffer index offset of these instances, if equals -1 * \param buffer_offset buffer index offset of these instances, if equals -1
* this means we do not have buffer index allocated to the gbm * a buffer index is assigned to each instance that requires repetitive prediction
* a buffer index is assigned to each instance that requires repetitive prediction * the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
* the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size") * the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
* \param info extra side information that may be needed for prediction * \param info extra side information that may be needed for prediction
* \param out_preds output vector to hold the predictions * \param out_preds output vector to hold the predictions
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
* we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
*/ */
virtual void Predict(const FMatrix &fmat, virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<float> *out_preds) = 0; std::vector<float> *out_preds,
unsigned ntree_limit = 0) = 0;
/*! /*!
* \brief dump the model in text format * \brief dump the model in text format
* \param fmap feature map that may help give interpretations of feature * \param fmap feature map that may help give interpretations of feature
@ -73,21 +75,11 @@ class IGradBooster {
// destrcutor // destrcutor
virtual ~IGradBooster(void){} virtual ~IGradBooster(void){}
}; };
} // namespace gbm /*!
} // namespace xgboost * \brief create a gradient booster from a given name
* \param name name of gradient booster
#include "gbtree-inl.hpp" */
#include "gblinear-inl.hpp" IGradBooster* CreateGradBooster(const char *name);
namespace xgboost {
namespace gbm {
template<typename FMatrix>
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
utils::Error("unknown booster type: %s", name);
return NULL;
}
} // namespace gbm } // namespace gbm
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_GBM_GBM_H_ #endif // XGBOOST_GBM_GBM_H_
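
The Predict contract above keeps the output convention referenced in the implementations earlier in this diff: predictions are laid out row-major as nrow * num_output_group. A trivial standalone sketch of that indexing, with made-up sizes:

```cpp
// Standalone sketch of the nrow * num_output_group prediction layout.
#include <cstdio>
#include <vector>

int main() {
  const size_t nrow = 4;
  const int ngroup = 3;                      // e.g. 3-class softmax
  std::vector<float> preds(nrow * ngroup, 0.0f);
  preds[2 * ngroup + 1] = 0.7f;              // score of row 2 for group 1
  std::printf("preds[row=2][group=1] = %f\n", preds[2 * ngroup + 1]);
  return 0;
}
```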


@ -9,21 +9,21 @@
#include <utility> #include <utility>
#include <string> #include <string>
#include "./gbm.h" #include "./gbm.h"
#include "../utils/omp.h"
#include "../tree/updater.h" #include "../tree/updater.h"
namespace xgboost { namespace xgboost {
namespace gbm { namespace gbm {
/*! /*!
* \brief gradient boosted tree * \brief gradient boosted tree
* \tparam FMatrix the data type the updater takes
*/ */
template<typename FMatrix> class GBTree : public IGradBooster {
class GBTree : public IGradBooster<FMatrix> {
public: public:
virtual ~GBTree(void) { virtual ~GBTree(void) {
this->Clear(); this->Clear();
} }
virtual void SetParam(const char *name, const char *val) { virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strncmp(name, "bst:", 4)) { if (!strncmp(name, "bst:", 4)) {
cfg.push_back(std::make_pair(std::string(name+4), std::string(val))); cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
// set into updaters, if already intialized // set into updaters, if already intialized
@ -82,12 +82,12 @@ class GBTree : public IGradBooster<FMatrix> {
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized"); utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized"); utils::Assert(trees.size() == 0, "GBTree: model already initialized");
} }
virtual void DoBoost(const FMatrix &fmat, virtual void DoBoost(IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) { std::vector<bst_gpair> *in_gpair) {
const std::vector<bst_gpair> &gpair = *in_gpair; const std::vector<bst_gpair> &gpair = *in_gpair;
if (mparam.num_output_group == 1) { if (mparam.num_output_group == 1) {
this->BoostNewTrees(gpair, fmat, info, 0); this->BoostNewTrees(gpair, p_fmat, info, 0);
} else { } else {
const int ngroup = mparam.num_output_group; const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0, utils::Check(gpair.size() % ngroup == 0,
@ -99,14 +99,15 @@ class GBTree : public IGradBooster<FMatrix> {
for (bst_omp_uint i = 0; i < nsize; ++i) { for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid]; tmp[i] = gpair[i * ngroup + gid];
} }
this->BoostNewTrees(tmp, fmat, info, gid); this->BoostNewTrees(tmp, p_fmat, info, gid);
} }
} }
} }
virtual void Predict(const FMatrix &fmat, virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<float> *out_preds) { std::vector<float> *out_preds,
unsigned ntree_limit = 0) {
int nthread; int nthread;
#pragma omp parallel #pragma omp parallel
{ {
@ -118,17 +119,13 @@ class GBTree : public IGradBooster<FMatrix> {
} }
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
preds.resize(0); const size_t stride = info.num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction // start collecting the prediction
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
"base_rowid is not set correctly");
// output convention: nrow * k, where nrow is number of rows
// k is number of group
preds.resize(preds.size() + batch.size * mparam.num_output_group);
// parallel over local batch // parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
@ -136,13 +133,14 @@ class GBTree : public IGradBooster<FMatrix> {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
tree::RegTree::FVec &feats = thread_temp[tid]; tree::RegTree::FVec &feats = thread_temp[tid];
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i); int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
const unsigned root_idx = info.GetRoot(ridx); utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound");
// loop over output groups // loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) { for (int gid = 0; gid < mparam.num_output_group; ++gid) {
preds[ridx * mparam.num_output_group + gid] =
this->Pred(batch[i], this->Pred(batch[i],
buffer_offset < 0 ? -1 : buffer_offset + ridx, buffer_offset < 0 ? -1 : buffer_offset + ridx,
gid, root_idx, &feats); gid, info.GetRoot(ridx), &feats,
&preds[ridx * mparam.num_output_group + gid], stride,
ntree_limit);
} }
} }
} }
@ -174,20 +172,20 @@ class GBTree : public IGradBooster<FMatrix> {
updaters.clear(); updaters.clear();
std::string tval = tparam.updater_seq; std::string tval = tparam.updater_seq;
char *pstr; char *pstr;
pstr = strtok(&tval[0], ","); pstr = std::strtok(&tval[0], ",");
while (pstr != NULL) { while (pstr != NULL) {
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr)); updaters.push_back(tree::CreateUpdater(pstr));
for (size_t j = 0; j < cfg.size(); ++j) { for (size_t j = 0; j < cfg.size(); ++j) {
// set parameters // set parameters
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str()); updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
} }
pstr = strtok(NULL, ","); pstr = std::strtok(NULL, ",");
} }
tparam.updater_initialized = 1; tparam.updater_initialized = 1;
} }
// do boosting for a specific group // do boosting for a specific group
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair, inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
int bst_group) { int bst_group) {
this->InitUpdater(); this->InitUpdater();
@ -202,7 +200,7 @@ class GBTree : public IGradBooster<FMatrix> {
} }
// update the trees // update the trees
for (size_t i = 0; i < updaters.size(); ++i) { for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->Update(gpair, fmat, info, new_trees); updaters[i]->Update(gpair, p_fmat, info, new_trees);
} }
// push back to model // push back to model
for (size_t i = 0; i < new_trees.size(); ++i) { for (size_t i = 0; i < new_trees.size(); ++i) {
@ -212,34 +210,53 @@ class GBTree : public IGradBooster<FMatrix> {
mparam.num_trees += tparam.num_parallel_tree; mparam.num_trees += tparam.num_parallel_tree;
} }
// make a prediction for a single instance // make a prediction for a single instance
inline float Pred(const SparseBatch::Inst &inst, inline void Pred(const RowBatch::Inst &inst,
int64_t buffer_index, int64_t buffer_index,
int bst_group, int bst_group,
unsigned root_index, unsigned root_index,
tree::RegTree::FVec *p_feats) { tree::RegTree::FVec *p_feats,
float *out_pred, size_t stride, unsigned ntree_limit) {
size_t itop = 0; size_t itop = 0;
float psum = 0.0f; float psum = 0.0f;
// sum of leaf vector
std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
const int64_t bid = mparam.BufferOffset(buffer_index, bst_group); const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
// number of valid trees
unsigned treeleft = ntree_limit == 0 ? std::numeric_limits<unsigned>::max() : ntree_limit;
// load buffered results if any // load buffered results if any
if (bid >= 0) { if (bid >= 0 && ntree_limit == 0) {
itop = pred_counter[bid]; itop = pred_counter[bid];
psum = pred_buffer[bid]; psum = pred_buffer[bid];
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
vec_psum[i] = pred_buffer[bid + i + 1];
}
} }
if (itop != trees.size()) { if (itop != trees.size()) {
p_feats->Fill(inst); p_feats->Fill(inst);
for (size_t i = itop; i < trees.size(); ++i) { for (size_t i = itop; i < trees.size(); ++i) {
if (tree_info[i] == bst_group) { if (tree_info[i] == bst_group) {
psum += trees[i]->Predict(*p_feats, root_index); int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
psum += (*trees[i])[tid].leaf_value();
for (int j = 0; j < mparam.size_leaf_vector; ++j) {
vec_psum[j] += trees[i]->leafvec(tid)[j];
}
if(--treeleft == 0) break;
} }
} }
p_feats->Drop(inst); p_feats->Drop(inst);
} }
// updated the buffered results // updated the buffered results
if (bid >= 0) { if (bid >= 0 && ntree_limit == 0) {
pred_counter[bid] = static_cast<unsigned>(trees.size()); pred_counter[bid] = static_cast<unsigned>(trees.size());
pred_buffer[bid] = psum; pred_buffer[bid] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
pred_buffer[bid + i + 1] = vec_psum[i];
}
}
out_pred[0] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
out_pred[stride * (i + 1)] = vec_psum[i];
} }
return psum;
} }
// --- data structure --- // --- data structure ---
/*! \brief training parameters */ /*! \brief training parameters */
@ -263,6 +280,7 @@ class GBTree : public IGradBooster<FMatrix> {
updater_initialized = 0; updater_initialized = 0;
} }
inline void SetParam(const char *name, const char *val){ inline void SetParam(const char *name, const char *val){
using namespace std;
if (!strcmp(name, "updater") && if (!strcmp(name, "updater") &&
strcmp(updater_seq.c_str(), val) != 0) { strcmp(updater_seq.c_str(), val) != 0) {
updater_seq = val; updater_seq = val;
@ -292,15 +310,18 @@ class GBTree : public IGradBooster<FMatrix> {
* suppose we have n instances and k groups, the output will be k*n * suppose we have n instances and k groups, the output will be k*n
*/ */
int num_output_group; int num_output_group;
/*! \brief size of leaf vector needed in tree */
int size_leaf_vector;
/*! \brief reserved parameters */ /*! \brief reserved parameters */
int reserved[32]; int reserved[31];
/*! \brief constructor */ /*! \brief constructor */
ModelParam(void) { ModelParam(void) {
num_trees = 0; num_trees = 0;
num_roots = num_feature = 0; num_roots = num_feature = 0;
num_pbuffer = 0; num_pbuffer = 0;
num_output_group = 1; num_output_group = 1;
memset(reserved, 0, sizeof(reserved)); size_leaf_vector = 0;
std::memset(reserved, 0, sizeof(reserved));
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -308,14 +329,16 @@ class GBTree : public IGradBooster<FMatrix> {
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val); if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
if (!strcmp("num_output_group", name)) num_output_group = atol(val); if (!strcmp("num_output_group", name)) num_output_group = atol(val);
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val); if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
} }
/*! \return size of prediction buffer actually needed */ /*! \return size of prediction buffer actually needed */
inline size_t PredBufferSize(void) const { inline size_t PredBufferSize(void) const {
return num_output_group * num_pbuffer; return num_output_group * num_pbuffer * (size_leaf_vector + 1);
} }
/*! /*!
* \brief get the buffer offset given a buffer index and group id * \brief get the buffer offset given a buffer index and group id
@ -324,7 +347,7 @@ class GBTree : public IGradBooster<FMatrix> {
inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const { inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
if (buffer_index < 0) return -1; if (buffer_index < 0) return -1;
utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer"); utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
return buffer_index + num_pbuffer * bst_group; return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
} }
}; };
// training parameter // training parameter
@ -345,7 +368,7 @@ class GBTree : public IGradBooster<FMatrix> {
// temporary per-thread storage // temporary per-thread storage
std::vector<tree::RegTree::FVec> thread_temp; std::vector<tree::RegTree::FVec> thread_temp;
// the updaters that can be applied to each of tree // the updaters that can be applied to each of tree
std::vector< tree::IUpdater<FMatrix>* > updaters; std::vector<tree::IUpdater*> updaters;
}; };
} // namespace gbm } // namespace gbm
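
The reworked prediction buffer above now reserves (size_leaf_vector + 1) floats per (instance, group) slot, which is exactly what PredBufferSize and BufferOffset compute. A standalone sketch of that arithmetic with made-up sizes:

```cpp
// Standalone sketch of PredBufferSize / BufferOffset from the diff above.
#include <cstdio>

int main() {
  long num_pbuffer = 100;       // number of buffered instances
  int num_output_group = 3;     // e.g. 3-class softmax
  int size_leaf_vector = 2;     // extra per-leaf values stored alongside the score
  long pred_buffer_size = num_output_group * num_pbuffer * (size_leaf_vector + 1);

  long buffer_index = 7;
  int bst_group = 1;
  long offset = (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
  std::printf("buffer size=%ld, offset of (row 7, group 1)=%ld\n",
              pred_buffer_size, offset);
  return 0;
}
```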


@ -13,7 +13,7 @@ namespace xgboost {
/*! \brief namespace related to data format */ /*! \brief namespace related to data format */
namespace io { namespace io {
/*! \brief DMatrix object that I/O module support save/load */ /*! \brief DMatrix object that I/O module support save/load */
typedef learner::DMatrix<FMatrixS> DataMatrix; typedef learner::DMatrix DataMatrix;
/*! /*!
* \brief load DataMatrix from stream * \brief load DataMatrix from stream
* \param fname file name to be loaded * \param fname file name to be loaded


@ -16,6 +16,7 @@
#include "../utils/utils.h" #include "../utils/utils.h"
#include "../learner/dmatrix.h" #include "../learner/dmatrix.h"
#include "./io.h" #include "./io.h"
#include "./simple_fmatrix-inl.hpp"
namespace xgboost { namespace xgboost {
namespace io { namespace io {
@ -24,11 +25,16 @@ class DMatrixSimple : public DataMatrix {
public: public:
// constructor // constructor
DMatrixSimple(void) : DataMatrix(kMagic) { DMatrixSimple(void) : DataMatrix(kMagic) {
this->fmat.set_iter(new OneBatchIter(this)); fmat_ = new FMatrixS(new OneBatchIter(this));
this->Clear(); this->Clear();
} }
// virtual destructor // virtual destructor
virtual ~DMatrixSimple(void) {} virtual ~DMatrixSimple(void) {
delete fmat_;
}
virtual IFMatrix *fmat(void) const {
return fmat_;
}
/*! \brief clear the storage */ /*! \brief clear the storage */
inline void Clear(void) { inline void Clear(void) {
row_ptr_.clear(); row_ptr_.clear();
@ -41,15 +47,17 @@ class DMatrixSimple : public DataMatrix {
this->info = src.info; this->info = src.info;
this->Clear(); this->Clear();
// clone data content into this matrix // clone data content into this matrix
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator(); utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) { for (size_t i = 0; i < batch.size; ++i) {
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
row_data_.resize(row_data_.size() + inst.length); row_data_.resize(row_data_.size() + inst.length);
memcpy(&row_data_[row_ptr_.back()], inst.data, if (inst.length != 0) {
sizeof(SparseBatch::Entry) * inst.length); std::memcpy(&row_data_[row_ptr_.back()], inst.data,
sizeof(RowBatch::Entry) * inst.length);
}
row_ptr_.push_back(row_ptr_.back() + inst.length); row_ptr_.push_back(row_ptr_.back() + inst.length);
} }
} }
@ -59,10 +67,10 @@ class DMatrixSimple : public DataMatrix {
* \param feats features * \param feats features
* \return the index of added row * \return the index of added row
*/ */
inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) { inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
for (size_t i = 0; i < feats.size(); ++i) { for (size_t i = 0; i < feats.size(); ++i) {
row_data_.push_back(feats[i]); row_data_.push_back(feats[i]);
info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].findex+1)); info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
} }
row_ptr_.push_back(row_ptr_.back() + feats.size()); row_ptr_.push_back(row_ptr_.back() + feats.size());
info.info.num_row += 1; info.info.num_row += 1;
@ -74,14 +82,15 @@ class DMatrixSimple : public DataMatrix {
* \param silent whether print information or not * \param silent whether print information or not
*/ */
inline void LoadText(const char* fname, bool silent = false) { inline void LoadText(const char* fname, bool silent = false) {
using namespace std;
this->Clear(); this->Clear();
FILE* file = utils::FopenCheck(fname, "r"); FILE* file = utils::FopenCheck(fname, "r");
float label; bool init = true; float label; bool init = true;
char tmp[1024]; char tmp[1024];
std::vector<SparseBatch::Entry> feats; std::vector<RowBatch::Entry> feats;
while (fscanf(file, "%s", tmp) == 1) { while (fscanf(file, "%s", tmp) == 1) {
SparseBatch::Entry e; RowBatch::Entry e;
if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) { if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
feats.push_back(e); feats.push_back(e);
} else { } else {
if (!init) { if (!init) {
@ -98,8 +107,10 @@ class DMatrixSimple : public DataMatrix {
this->AddRow(feats); this->AddRow(feats);
if (!silent) { if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded from %s\n", utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
info.num_row(), info.num_col(), row_data_.size(), fname); static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
} }
fclose(file); fclose(file);
// try to load in additional file // try to load in additional file
@ -125,7 +136,7 @@ class DMatrixSimple : public DataMatrix {
* \return whether loading is success * \return whether loading is success
*/ */
inline bool LoadBinary(const char* fname, bool silent = false) { inline bool LoadBinary(const char* fname, bool silent = false) {
FILE *fp = fopen64(fname, "rb"); std::FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false; if (fp == NULL) return false;
utils::FileStream fs(fp); utils::FileStream fs(fp);
this->LoadBinary(fs, silent, fname); this->LoadBinary(fs, silent, fname);
@ -139,24 +150,26 @@ class DMatrixSimple : public DataMatrix {
* \param fname file name, used to print message * \param fname file name, used to print message
*/ */
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
int magic; int tmagic;
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
utils::Check(magic == kMagic, "invalid format,magic number mismatch"); utils::Check(tmagic == kMagic, "invalid format,magic number mismatch");
info.LoadBinary(fs); info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
fmat.LoadColAccess(fs); fmat_->LoadColAccess(fs);
if (!silent) { if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded", utils::Printf("%lux%lu matrix with %lu entries is loaded",
info.num_row(), info.num_col(), row_data_.size()); static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()));
if (fname != NULL) { if (fname != NULL) {
printf(" from %s\n", fname); utils::Printf(" from %s\n", fname);
} else { } else {
printf("\n"); utils::Printf("\n");
} }
if (info.group_ptr.size() != 0) { if (info.group_ptr.size() != 0) {
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1); utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
} }
} }
} }
@ -167,19 +180,22 @@ class DMatrixSimple : public DataMatrix {
*/ */
inline void SaveBinary(const char* fname, bool silent = false) const { inline void SaveBinary(const char* fname, bool silent = false) const {
utils::FileStream fs(utils::FopenCheck(fname, "wb")); utils::FileStream fs(utils::FopenCheck(fname, "wb"));
int magic = kMagic; int tmagic = kMagic;
fs.Write(&magic, sizeof(magic)); fs.Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fs); info.SaveBinary(fs);
FMatrixS::SaveBinary(fs, row_ptr_, row_data_); FMatrixS::SaveBinary(fs, row_ptr_, row_data_);
fmat.SaveColAccess(fs); fmat_->SaveColAccess(fs);
fs.Close(); fs.Close();
if (!silent) { if (!silent) {
printf("%lux%lu matrix with %lu entries is saved to %s\n", utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
info.num_row(), info.num_col(), row_data_.size(), fname); static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
if (info.group_ptr.size() != 0) { if (info.group_ptr.size() != 0) {
printf("data contains %lu groups\n", info.group_ptr.size()-1); utils::Printf("data contains %u groups\n",
static_cast<unsigned>(info.group_ptr.size()-1));
} }
} }
} }
@ -193,6 +209,7 @@ class DMatrixSimple : public DataMatrix {
* \param savebuffer whether do save binary buffer if it is text * \param savebuffer whether do save binary buffer if it is text
*/ */
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) { inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
using namespace std;
size_t len = strlen(fname); size_t len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")) { if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
if (!this->LoadBinary(fname, silent)) { if (!this->LoadBinary(fname, silent)) {
@ -201,7 +218,7 @@ class DMatrixSimple : public DataMatrix {
return; return;
} }
char bname[1024]; char bname[1024];
snprintf(bname, sizeof(bname), "%s.buffer", fname); utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname);
if (!this->LoadBinary(bname, silent)) { if (!this->LoadBinary(bname, silent)) {
this->LoadText(fname, silent); this->LoadText(fname, silent);
if (savebuffer) this->SaveBinary(bname, silent); if (savebuffer) this->SaveBinary(bname, silent);
@ -211,13 +228,15 @@ class DMatrixSimple : public DataMatrix {
/*! \brief row pointer of CSR sparse storage */ /*! \brief row pointer of CSR sparse storage */
std::vector<size_t> row_ptr_; std::vector<size_t> row_ptr_;
/*! \brief data in the row */ /*! \brief data in the row */
std::vector<SparseBatch::Entry> row_data_; std::vector<RowBatch::Entry> row_data_;
/*! \brief the real fmatrix */
FMatrixS *fmat_;
/*! \brief magic number used to identify DMatrix */ /*! \brief magic number used to identify DMatrix */
static const int kMagic = 0xffffab01; static const int kMagic = 0xffffab01;
protected: protected:
// one batch iterator that return content in the matrix // one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<SparseBatch> { struct OneBatchIter: utils::IIterator<RowBatch> {
explicit OneBatchIter(DMatrixSimple *parent) explicit OneBatchIter(DMatrixSimple *parent)
: at_first_(true), parent_(parent) {} : at_first_(true), parent_(parent) {}
virtual ~OneBatchIter(void) {} virtual ~OneBatchIter(void) {}
@ -229,11 +248,11 @@ class DMatrixSimple : public DataMatrix {
at_first_ = false; at_first_ = false;
batch_.size = parent_->row_ptr_.size() - 1; batch_.size = parent_->row_ptr_.size() - 1;
batch_.base_rowid = 0; batch_.base_rowid = 0;
batch_.row_ptr = &parent_->row_ptr_[0]; batch_.ind_ptr = BeginPtr(parent_->row_ptr_);
batch_.data_ptr = &parent_->row_data_[0]; batch_.data_ptr = BeginPtr(parent_->row_data_);
return true; return true;
} }
virtual const SparseBatch &Value(void) const { virtual const RowBatch &Value(void) const {
return batch_; return batch_;
} }
@ -243,7 +262,7 @@ class DMatrixSimple : public DataMatrix {
// pointer to parent // pointer to parent
DMatrixSimple *parent_; DMatrixSimple *parent_;
// temporary space for the batch // temporary space for the batch
SparseBatch batch_; RowBatch batch_;
}; };
}; };
} // namespace io } // namespace io
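
LoadText above distinguishes feature tokens ("index:value") from bare labels purely by whether sscanf matches both fields. A standalone sketch of that token handling on a hard-coded token list (it assumes, as the format does, that the first token of the file is a label):

```cpp
// Standalone sketch of the LoadText token handling shown above.
#include <cstdio>
#include <vector>

struct Entry { unsigned index; float fvalue; };

int main() {
  const char *tokens[] = {"1", "0:1.5", "3:0.2", "0", "2:4.0"};
  std::vector<std::vector<Entry> > rows;
  std::vector<float> labels;
  for (size_t i = 0; i < sizeof(tokens) / sizeof(tokens[0]); ++i) {
    Entry e;
    if (std::sscanf(tokens[i], "%u:%f", &e.index, &e.fvalue) == 2) {
      rows.back().push_back(e);              // "index:value": feature of the current row
    } else {
      float label;
      std::sscanf(tokens[i], "%f", &label);  // bare number: label starting a new row
      labels.push_back(label);
      rows.push_back(std::vector<Entry>());
    }
  }
  std::printf("%zu rows, first row has %zu features\n", rows.size(), rows[0].size());
  return 0;
}
```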


@ -0,0 +1,242 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
/*!
* \file simple_fmatrix-inl.hpp
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#include "../data.h"
#include "../utils/utils.h"
#include "../utils/random.h"
#include "../utils/omp.h"
#include "../utils/matrix_csr.h"
namespace xgboost {
namespace io {
/*!
* \brief sparse matrix that support column access, CSC
*/
class FMatrixS : public IFMatrix{
public:
typedef SparseBatch::Entry Entry;
/*! \brief constructor */
FMatrixS(utils::IIterator<RowBatch> *iter) {
this->iter_ = iter;
}
// destructor
virtual ~FMatrixS(void) {
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const {
return col_ptr_.size() != 0;
}
/*! \brief get number of columns */
virtual size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1;
}
/*! \brief get number of buffered rows */
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
return buffered_rowset_;
}
/*! \brief get column size */
virtual size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx];
}
/*! \brief get column density */
virtual float GetColDensity(size_t cidx) const {
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
virtual void InitColAccess(float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
this->InitColData(pkeep);
}
/*!
* \brief get the row iterator associated with FMatrix
*/
virtual utils::IIterator<RowBatch>* RowIterator(void) {
iter_->BeforeFirst();
return iter_;
}
/*!
* \brief get the column based iterator
*/
virtual utils::IIterator<ColBatch>* ColIterator(void) {
size_t ncol = this->NumCol();
col_iter_.col_index_.resize(ncol);
for (size_t i = 0; i < ncol; ++i) {
col_iter_.col_index_[i] = static_cast<bst_uint>(i);
}
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
/*!
* \brief column based iterator
*/
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
col_iter_.col_index_ = fset;
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
/*!
* \brief save column access data into stream
* \param fo output stream to save to
*/
inline void SaveColAccess(utils::IStream &fo) const {
fo.Write(buffered_rowset_);
if (buffered_rowset_.size() != 0) {
SaveBinary(fo, col_ptr_, col_data_);
}
}
/*!
* \brief load column access data from stream
* \param fi input stream to load from
*/
inline void LoadColAccess(utils::IStream &fi) {
utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
if (buffered_rowset_.size() != 0) {
LoadBinary(fi, &col_ptr_, &col_data_);
}
}
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format");
}
}
protected:
/*!
* \brief initialize column data
* \param pkeep probability to keep a row
*/
inline void InitColData(float pkeep) {
buffered_rowset_.clear();
// note: this part of the code is serial; TODO: parallelize this transformation
utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
builder.InitBudget(0);
// start working
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].index);
}
}
}
}
builder.InitStorage();
iter_->BeforeFirst();
size_t ktop = 0;
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (ktop < buffered_rowset_.size() &&
buffered_rowset_[ktop] == batch.base_rowid+i) {
++ktop;
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.PushElem(inst[j].index,
Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
}
}
}
}
// sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ncol; ++i) {
std::sort(&col_data_[0] + col_ptr_[i],
&col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
}
}
private:
// one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<ColBatch> {
OneBatchIter(void) : at_first_(true){}
virtual ~OneBatchIter(void) {}
virtual void BeforeFirst(void) {
at_first_ = true;
}
virtual bool Next(void) {
if (!at_first_) return false;
at_first_ = false;
return true;
}
virtual const ColBatch &Value(void) const {
return batch_;
}
inline void SetBatch(const std::vector<size_t> &ptr,
const std::vector<ColBatch::Entry> &data) {
batch_.size = col_index_.size();
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
for (size_t i = 0; i < col_data_.size(); ++i) {
const bst_uint ridx = col_index_[i];
col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
static_cast<bst_uint>(ptr[ridx+1] - ptr[ridx]));
}
batch_.col_index = BeginPtr(col_index_);
batch_.col_data = BeginPtr(col_data_);
this->BeforeFirst();
}
// data content
std::vector<bst_uint> col_index_;
std::vector<ColBatch::Inst> col_data_;
// whether is at first
bool at_first_;
// temporary space for the batch
ColBatch batch_;
};
// --- data structure used to support InitColAccess --
// column iterator
OneBatchIter col_iter_;
// row iterator
utils::IIterator<RowBatch> *iter_;
/*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
/*! \brief column datas in CSC format */
std::vector<ColBatch::Entry> col_data_;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
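
InitColData above builds the CSC store in two passes over the row iterator: first count the entries per column, then fill them in and sort each column by feature value. A standalone sketch of the same two-pass idea on a toy CSR input, without the SparseCSRMBuilder helper:

```cpp
// Standalone sketch of the two-pass CSC construction used by InitColData.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Entry { unsigned row; float fvalue; };

int main() {
  // toy CSR input: each non-zero has a row, a column and a value
  unsigned row_of[] = {0, 0, 1, 1};
  unsigned col_of[] = {0, 2, 1, 2};
  float    val_of[] = {1.0f, 3.0f, 2.0f, 0.5f};
  const size_t nnz = 4, ncol = 3;

  // pass 1: budget per column -> cumulative column pointers
  std::vector<size_t> col_ptr(ncol + 1, 0);
  for (size_t i = 0; i < nnz; ++i) ++col_ptr[col_of[i] + 1];
  for (size_t c = 0; c < ncol; ++c) col_ptr[c + 1] += col_ptr[c];

  // pass 2: place entries using a moving cursor per column
  std::vector<size_t> cursor(col_ptr.begin(), col_ptr.end() - 1);
  std::vector<Entry> col_data(nnz);
  for (size_t i = 0; i < nnz; ++i) {
    Entry e = {row_of[i], val_of[i]};
    col_data[cursor[col_of[i]]++] = e;
  }
  // sort each column by feature value, as the omp loop above does
  for (size_t c = 0; c < ncol; ++c) {
    std::sort(col_data.begin() + col_ptr[c], col_data.begin() + col_ptr[c + 1],
              [](const Entry &a, const Entry &b) { return a.fvalue < b.fvalue; });
  }
  for (size_t c = 0; c < ncol; ++c)
    std::printf("col %zu has %zu entries\n", c, col_ptr[c + 1] - col_ptr[c]);
  return 0;
}
```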


@ -7,8 +7,9 @@
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#include <vector> #include <vector>
#include <cstring>
#include "../data.h" #include "../data.h"
#include "../utils/io.h"
namespace xgboost { namespace xgboost {
namespace learner { namespace learner {
/*! /*!
@ -89,6 +90,7 @@ struct MetaInfo {
} }
// try to load group information from file, if exists // try to load group information from file, if exists
inline bool TryLoadGroup(const char* fname, bool silent = false) { inline bool TryLoadGroup(const char* fname, bool silent = false) {
using namespace std;
FILE *fi = fopen64(fname, "r"); FILE *fi = fopen64(fname, "r");
if (fi == NULL) return false; if (fi == NULL) return false;
group_ptr.push_back(0); group_ptr.push_back(0);
@ -97,12 +99,14 @@ struct MetaInfo {
group_ptr.push_back(group_ptr.back()+nline); group_ptr.push_back(group_ptr.back()+nline);
} }
if (!silent) { if (!silent) {
printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname); utils::Printf("%u groups are loaded from %s\n",
static_cast<unsigned>(group_ptr.size()-1), fname);
} }
fclose(fi); fclose(fi);
return true; return true;
} }
inline std::vector<float>& GetFloatInfo(const char *field) { inline std::vector<float>& GetFloatInfo(const char *field) {
using namespace std;
if (!strcmp(field, "label")) return labels; if (!strcmp(field, "label")) return labels;
if (!strcmp(field, "weight")) return weights; if (!strcmp(field, "weight")) return weights;
if (!strcmp(field, "base_margin")) return base_margin; if (!strcmp(field, "base_margin")) return base_margin;
@ -113,6 +117,7 @@ struct MetaInfo {
return ((MetaInfo*)this)->GetFloatInfo(field); return ((MetaInfo*)this)->GetFloatInfo(field);
} }
inline std::vector<unsigned> &GetUIntInfo(const char *field) { inline std::vector<unsigned> &GetUIntInfo(const char *field) {
using namespace std;
if (!strcmp(field, "root_index")) return info.root_index; if (!strcmp(field, "root_index")) return info.root_index;
if (!strcmp(field, "fold_index")) return info.fold_index; if (!strcmp(field, "fold_index")) return info.fold_index;
utils::Error("unknown field %s", field); utils::Error("unknown field %s", field);
@ -123,15 +128,16 @@ struct MetaInfo {
} }
// try to load weight information from file, if exists // try to load weight information from file, if exists
inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) { inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
std::vector<float> &weights = this->GetFloatInfo(field); using namespace std;
std::vector<float> &data = this->GetFloatInfo(field);
FILE *fi = fopen64(fname, "r"); FILE *fi = fopen64(fname, "r");
if (fi == NULL) return false; if (fi == NULL) return false;
float wt; float wt;
while (fscanf(fi, "%f", &wt) == 1) { while (fscanf(fi, "%f", &wt) == 1) {
weights.push_back(wt); data.push_back(wt);
} }
if (!silent) { if (!silent) {
printf("loading %s from %s\n", field, fname); utils::Printf("loading %s from %s\n", field, fname);
} }
fclose(fi); fclose(fi);
return true; return true;
@ -142,7 +148,6 @@ struct MetaInfo {
* \brief data object used for learning, * \brief data object used for learning,
* \tparam FMatrix type of feature data source * \tparam FMatrix type of feature data source
*/ */
template<typename FMatrix>
struct DMatrix { struct DMatrix {
/*! /*!
* \brief magic number associated with this object * \brief magic number associated with this object
@ -151,8 +156,6 @@ struct DMatrix {
const int magic; const int magic;
/*! \brief meta information about the dataset */ /*! \brief meta information about the dataset */
MetaInfo info; MetaInfo info;
/*! \brief feature matrix about data content */
FMatrix fmat;
/*! /*!
* \brief cache pointer to verify if the data structure is cached in some learner * \brief cache pointer to verify if the data structure is cached in some learner
* used to verify if DMatrix is cached * used to verify if DMatrix is cached
@ -160,6 +163,8 @@ struct DMatrix {
void *cache_learner_ptr_; void *cache_learner_ptr_;
/*! \brief default constructor */ /*! \brief default constructor */
explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {} explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
/*! \brief get feature matrix about data content */
virtual IFMatrix *fmat(void) const = 0;
// virtual destructor // virtual destructor
virtual ~DMatrix(void){} virtual ~DMatrix(void){}
}; };


@ -8,8 +8,8 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <string> #include <string>
#include <climits>
#include <cmath> #include <cmath>
#include <climits>
#include <algorithm> #include <algorithm>
#include "./evaluation.h" #include "./evaluation.h"
#include "./helper_utils.h" #include "./helper_utils.h"
@ -24,9 +24,12 @@ template<typename Derived>
struct EvalEWiseBase : public IEvaluator { struct EvalEWiseBase : public IEvaluator {
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match"); "label and prediction size not match");
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
float sum = 0.0, wsum = 0.0; float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+: sum, wsum) schedule(static) #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) { for (bst_omp_uint i = 0; i < ndata; ++i) {
@ -99,17 +102,58 @@ struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
} }
}; };
/*! \brief ctest */
struct EvalCTest: public IEvaluator {
EvalCTest(IEvaluator *base, const char *name)
: base_(base), name_(name) {}
virtual ~EvalCTest(void) {
delete base_;
}
virtual const char *Name(void) const {
return name_.c_str();
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
size_t ngroup = preds.size() / info.labels.size() - 1;
const unsigned ndata = static_cast<unsigned>(info.labels.size());
utils::Check(ngroup > 1, "pred size does not meet requirement");
utils::Check(ndata == info.info.fold_index.size(), "need fold index");
double wsum = 0.0;
for (size_t k = 0; k < ngroup; ++k) {
std::vector<float> tpred;
MetaInfo tinfo;
for (unsigned i = 0; i < ndata; ++i) {
if (info.info.fold_index[i] == k) {
tpred.push_back(preds[i + (k + 1) * ndata]);
tinfo.labels.push_back(info.labels[i]);
tinfo.weights.push_back(info.GetWeight(i));
}
}
wsum += base_->Eval(tpred, tinfo);
}
return static_cast<float>(wsum / ngroup);
}
private:
IEvaluator *base_;
std::string name_;
};
/*! \brief AMS: also records best threshold */ /*! \brief AMS: also records best threshold */
struct EvalAMS : public IEvaluator { struct EvalAMS : public IEvaluator {
public: public:
explicit EvalAMS(const char *name) { explicit EvalAMS(const char *name) {
name_ = name; name_ = name;
// note: ams@0 will automatically select which ratio to go // note: ams@0 will automatically select which ratio to go
utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format"); utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
} }
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()); using namespace std;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams"); utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
std::vector< std::pair<float, unsigned> > rec(ndata); std::vector< std::pair<float, unsigned> > rec(ndata);
@ -140,7 +184,7 @@ struct EvalAMS : public IEvaluator {
} }
} }
if (ntop == ndata) { if (ntop == ndata) {
fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata); utils::Printf("\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
return static_cast<float>(tams); return static_cast<float>(tams);
} else { } else {
return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp))); return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp)));
@ -159,6 +203,7 @@ struct EvalAMS : public IEvaluator {
struct EvalPrecisionRatio : public IEvaluator{ struct EvalPrecisionRatio : public IEvaluator{
public: public:
explicit EvalPrecisionRatio(const char *name) : name_(name) { explicit EvalPrecisionRatio(const char *name) : name_(name) {
using namespace std;
if (sscanf(name, "apratio@%f", &ratio_) == 1) { if (sscanf(name, "apratio@%f", &ratio_) == 1) {
use_ap = 1; use_ap = 1;
} else { } else {
@ -168,9 +213,11 @@ struct EvalPrecisionRatio : public IEvaluator{
} }
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
utils::Assert(preds.size() == info.labels.size(), "label size predict size not match"); utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Assert(preds.size() % info.labels.size() == 0,
"label size predict size not match");
std::vector< std::pair<float, unsigned> > rec; std::vector< std::pair<float, unsigned> > rec;
for (size_t j = 0; j < preds.size(); ++j) { for (size_t j = 0; j < info.labels.size(); ++j) {
rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j))); rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
} }
std::sort(rec.begin(), rec.end(), CmpFirst); std::sort(rec.begin(), rec.end(), CmpFirst);
@ -206,10 +253,14 @@ struct EvalPrecisionRatio : public IEvaluator{
struct EvalAuc : public IEvaluator { struct EvalAuc : public IEvaluator {
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), "label size predict size not match"); utils::Check(info.labels.size() != 0, "label set cannot be empty");
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size()); utils::Check(preds.size() % info.labels.size() == 0,
"label size predict size not match");
std::vector<unsigned> tgptr(2, 0);
tgptr[1] = static_cast<unsigned>(info.labels.size());
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr; const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Check(gptr.back() == preds.size(), utils::Check(gptr.back() == info.labels.size(),
"EvalAuc: group structure must match number of prediction"); "EvalAuc: group structure must match number of prediction");
const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1); const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics // sum statistics
@ -293,6 +344,7 @@ struct EvalRankList : public IEvaluator {
protected: protected:
explicit EvalRankList(const char *name) { explicit EvalRankList(const char *name) {
using namespace std;
name_ = name; name_ = name;
minus_ = false; minus_ = false;
if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) { if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
@ -339,7 +391,7 @@ struct EvalNDCG : public EvalRankList{
for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) { for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
const unsigned rel = rec[i].second; const unsigned rel = rec[i].second;
if (rel != 0) { if (rel != 0) {
sumdcg += ((1 << rel) - 1) / log(i + 2.0); sumdcg += ((1 << rel) - 1) / std::log(i + 2.0);
} }
} }
return static_cast<float>(sumdcg); return static_cast<float>(sumdcg);


@ -36,6 +36,7 @@ struct IEvaluator{
namespace xgboost { namespace xgboost {
namespace learner { namespace learner {
inline IEvaluator* CreateEvaluator(const char *name) { inline IEvaluator* CreateEvaluator(const char *name) {
using namespace std;
if (!strcmp(name, "rmse")) return new EvalRMSE(); if (!strcmp(name, "rmse")) return new EvalRMSE();
if (!strcmp(name, "error")) return new EvalError(); if (!strcmp(name, "error")) return new EvalError();
if (!strcmp(name, "merror")) return new EvalMatchError(); if (!strcmp(name, "merror")) return new EvalMatchError();
@ -45,7 +46,9 @@ inline IEvaluator* CreateEvaluator(const char *name) {
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name); if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name); if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name);
if (!strncmp(name, "map", 3)) return new EvalMAP(name); if (!strncmp(name, "map", 3)) return new EvalMAP(name);
if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name); if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
if (!strncmp(name, "ct-", 3)) return new EvalCTest(CreateEvaluator(name+3), name);
utils::Error("unknown evaluation metric type: %s", name); utils::Error("unknown evaluation metric type: %s", name);
return NULL; return NULL;
} }
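A note on the new ct- branch above: it strips the three-character prefix and builds the inner metric from the rest of the name, so metric names compose. A small sketch of the resulting dispatch (EvalCTest itself is defined elsewhere and its exact semantics are not shown in this hunk):

    IEvaluator *plain = CreateEvaluator("rmse");     // plain RMSE metric
    IEvaluator *ct    = CreateEvaluator("ct-rmse");  // EvalCTest wrapping the evaluator built from "rmse"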
@ -54,6 +57,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
class EvalSet{ class EvalSet{
public: public:
inline void AddEval(const char *name) { inline void AddEval(const char *name) {
using namespace std;
for (size_t i = 0; i < evals_.size(); ++i) { for (size_t i = 0; i < evals_.size(); ++i) {
if (!strcmp(name, evals_[i]->Name())) return; if (!strcmp(name, evals_[i]->Name())) return;
} }
@ -71,11 +75,14 @@ class EvalSet{
for (size_t i = 0; i < evals_.size(); ++i) { for (size_t i = 0; i < evals_.size(); ++i) {
float res = evals_[i]->Eval(preds, info); float res = evals_[i]->Eval(preds, info);
char tmp[1024]; char tmp[1024];
snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res); utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
result += tmp; result += tmp;
} }
return result; return result;
} }
inline size_t Size(void) const {
return evals_.size();
}
private: private:
std::vector<const IEvaluator*> evals_; std::vector<const IEvaluator*> evals_;


@ -7,6 +7,7 @@
*/ */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <cmath>
#include <algorithm> #include <algorithm>
namespace xgboost { namespace xgboost {
namespace learner { namespace learner {


@ -21,7 +21,6 @@ namespace learner {
 * \brief learner that performs gradient boosting on specific objective functions * \brief learner that performs gradient boosting on specific objective functions
 * and does training and prediction * and does training and prediction
*/ */
template<typename FMatrix>
class BoostLearner { class BoostLearner {
public: public:
BoostLearner(void) { BoostLearner(void) {
@ -44,7 +43,7 @@ class BoostLearner {
 * data matrices to continue training, otherwise it will cause an error * data matrices to continue training, otherwise it will cause an error
* \param mats array of pointers to matrix whose prediction result need to be cached * \param mats array of pointers to matrix whose prediction result need to be cached
*/ */
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) { inline void SetCacheData(const std::vector<DMatrix*>& mats) {
// estimate feature bound // estimate feature bound
unsigned num_feature = 0; unsigned num_feature = 0;
// assign buffer index // assign buffer index
@ -64,13 +63,14 @@ class BoostLearner {
} }
char str_temp[25]; char str_temp[25];
if (num_feature > mparam.num_feature) { if (num_feature > mparam.num_feature) {
snprintf(str_temp, sizeof(str_temp), "%u", num_feature); utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
this->SetParam("bst:num_feature", str_temp); this->SetParam("bst:num_feature", str_temp);
} }
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size); utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
static_cast<unsigned long>(buffer_size));
this->SetParam("num_pbuffer", str_temp); this->SetParam("num_pbuffer", str_temp);
if (!silent) { if (!silent) {
printf("buffer_size=%ld\n", buffer_size); utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
} }
} }
/*! /*!
@ -79,6 +79,7 @@ class BoostLearner {
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
// in this version, bst: prefix is no longer required // in this version, bst: prefix is no longer required
if (strncmp(name, "bst:", 4) != 0) { if (strncmp(name, "bst:", 4) != 0) {
std::string n = "bst:"; n += name; std::string n = "bst:"; n += name;
@ -158,18 +159,18 @@ class BoostLearner {
 * if not, initialize it * if not, initialize it
* \param p_train pointer to the matrix used by training * \param p_train pointer to the matrix used by training
*/ */
inline void CheckInit(DMatrix<FMatrix> *p_train) { inline void CheckInit(DMatrix *p_train) {
p_train->fmat.InitColAccess(prob_buffer_row); p_train->fmat()->InitColAccess(prob_buffer_row);
} }
/*! /*!
* \brief update the model for one iteration * \brief update the model for one iteration
* \param iter current iteration number * \param iter current iteration number
* \param p_train pointer to the data matrix * \param p_train pointer to the data matrix
*/ */
inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) { inline void UpdateOneIter(int iter, const DMatrix &train) {
this->PredictRaw(train, &preds_); this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_); obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(train.fmat, train.info.info, &gpair_); gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
} }
/*! /*!
* \brief evaluate the model for specific iteration * \brief evaluate the model for specific iteration
@ -179,11 +180,11 @@ class BoostLearner {
* \return a string corresponding to the evaluation result * \return a string corresponding to the evaluation result
*/ */
inline std::string EvalOneIter(int iter, inline std::string EvalOneIter(int iter,
const std::vector<const DMatrix<FMatrix>*> &evals, const std::vector<const DMatrix*> &evals,
const std::vector<std::string> &evname) { const std::vector<std::string> &evname) {
std::string res; std::string res;
char tmp[256]; char tmp[256];
snprintf(tmp, sizeof(tmp), "[%d]", iter); utils::SPrintf(tmp, sizeof(tmp), "[%d]", iter);
res = tmp; res = tmp;
for (size_t i = 0; i < evals.size(); ++i) { for (size_t i = 0; i < evals.size(); ++i) {
this->PredictRaw(*evals[i], &preds_); this->PredictRaw(*evals[i], &preds_);
@ -198,7 +199,7 @@ class BoostLearner {
* \param metric name of metric * \param metric name of metric
* \return a pair of <evaluation name, result> * \return a pair of <evaluation name, result>
*/ */
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) { std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric(); if (metric == "auto") metric = obj_->DefaultEvalMetric();
IEvaluator *ev = CreateEvaluator(metric.c_str()); IEvaluator *ev = CreateEvaluator(metric.c_str());
this->PredictRaw(data, &preds_); this->PredictRaw(data, &preds_);
@ -212,11 +213,14 @@ class BoostLearner {
* \param data input data * \param data input data
* \param output_margin whether to only predict margin value instead of transformed prediction * \param output_margin whether to only predict margin value instead of transformed prediction
* \param out_preds output vector that stores the prediction * \param out_preds output vector that stores the prediction
 * \param ntree_limit limit the number of trees used by the boosted tree
 * predictor; when it equals 0, all the trees in the model are used
*/ */
inline void Predict(const DMatrix<FMatrix> &data, inline void Predict(const DMatrix &data,
bool output_margin, bool output_margin,
std::vector<float> *out_preds) const { std::vector<float> *out_preds,
this->PredictRaw(data, out_preds); unsigned ntree_limit = 0) const {
this->PredictRaw(data, out_preds, ntree_limit);
if (!output_margin) { if (!output_margin) {
obj_->PredTransform(out_preds); obj_->PredTransform(out_preds);
} }
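The new ntree_limit argument caps how many trees the booster uses at prediction time. A minimal usage sketch, assuming a trained BoostLearner named learner and a cached DMatrix named dtest (both placeholder names):

    std::vector<float> preds;
    learner.Predict(dtest, false, &preds);      // ntree_limit defaults to 0: use every tree
    learner.Predict(dtest, false, &preds, 10);  // use only the first 10 trees of the ensemble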
@ -235,22 +239,27 @@ class BoostLearner {
if (obj_ != NULL) return; if (obj_ != NULL) return;
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL"); utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str()); obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str()); gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) { for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str()); obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str()); gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
} }
if (evaluator_.Size() == 0) {
evaluator_.AddEval(obj_->DefaultEvalMetric()); evaluator_.AddEval(obj_->DefaultEvalMetric());
} }
}
/*! /*!
* \brief get un-transformed prediction * \brief get un-transformed prediction
* \param data training data matrix * \param data training data matrix
* \param out_preds output vector that stores the prediction * \param out_preds output vector that stores the prediction
 * \param ntree_limit limit the number of trees used by the boosted tree
 * predictor; when it equals 0, all the trees in the model are used
*/ */
inline void PredictRaw(const DMatrix<FMatrix> &data, inline void PredictRaw(const DMatrix &data,
std::vector<float> *out_preds) const { std::vector<float> *out_preds,
gbm_->Predict(data.fmat, this->FindBufferOffset(data), unsigned ntree_limit = 0) const {
data.info.info, out_preds); gbm_->Predict(data.fmat(), this->FindBufferOffset(data),
data.info.info, out_preds, ntree_limit);
// add base margin // add base margin
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()); const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@ -284,7 +293,7 @@ class BoostLearner {
base_score = 0.5f; base_score = 0.5f;
num_feature = 0; num_feature = 0;
num_class = 0; num_class = 0;
memset(reserved, 0, sizeof(reserved)); std::memset(reserved, 0, sizeof(reserved));
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -292,6 +301,7 @@ class BoostLearner {
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val)); if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
if (!strcmp("num_class", name)) num_class = atoi(val); if (!strcmp("num_class", name)) num_class = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
@ -307,7 +317,7 @@ class BoostLearner {
// model parameter // model parameter
ModelParam mparam; ModelParam mparam;
// gbm model that back everything // gbm model that back everything
gbm::IGradBooster<FMatrix> *gbm_; gbm::IGradBooster *gbm_;
// name of gbm model used for training // name of gbm model used for training
std::string name_gbm_; std::string name_gbm_;
// objective function // objective function
@ -324,14 +334,14 @@ class BoostLearner {
private: private:
// cache entry object that helps handle feature caching // cache entry object that helps handle feature caching
struct CacheEntry { struct CacheEntry {
const DMatrix<FMatrix> *mat_; const DMatrix *mat_;
size_t buffer_offset_; size_t buffer_offset_;
size_t num_row_; size_t num_row_;
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row) CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {} :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
}; };
// find internal buffer offset for a certain matrix; return -1 if it does not exist // find internal buffer offset for a certain matrix; return -1 if it does not exist
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const { inline int64_t FindBufferOffset(const DMatrix &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) { for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) { if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.info.num_row()) { if (cache_[i].num_row_ == mat.info.num_row()) {


@ -6,9 +6,9 @@
* \author Tianqi Chen, Kailong Chen * \author Tianqi Chen, Kailong Chen
*/ */
#include <vector> #include <vector>
#include <cmath>
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include <cmath>
#include <functional> #include <functional>
#include "../data.h" #include "../data.h"
#include "./objective.h" #include "./objective.h"
@ -37,7 +37,7 @@ struct LossType {
case kLogisticRaw: case kLogisticRaw:
case kLinearSquare: return x; case kLinearSquare: return x;
case kLogisticClassify: case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x)); case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x));
default: utils::Error("unknown loss_type"); return 0.0f; default: utils::Error("unknown loss_type"); return 0.0f;
} }
} }
@ -50,7 +50,7 @@ struct LossType {
inline float FirstOrderGradient(float predt, float label) const { inline float FirstOrderGradient(float predt, float label) const {
switch (loss_type) { switch (loss_type) {
case kLinearSquare: return predt - label; case kLinearSquare: return predt - label;
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt)); case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
case kLogisticClassify: case kLogisticClassify:
case kLogisticNeglik: return predt - label; case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f; default: utils::Error("unknown loss_type"); return 0.0f;
@ -65,7 +65,7 @@ struct LossType {
inline float SecondOrderGradient(float predt, float label) const { inline float SecondOrderGradient(float predt, float label) const {
switch (loss_type) { switch (loss_type) {
case kLinearSquare: return 1.0f; case kLinearSquare: return 1.0f;
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt)); case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
case kLogisticClassify: case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt); case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f; default: utils::Error("unknown loss_type"); return 0.0f;
@ -80,7 +80,7 @@ struct LossType {
loss_type == kLogisticNeglik ) { loss_type == kLogisticNeglik ) {
utils::Check(base_score > 0.0f && base_score < 1.0f, utils::Check(base_score > 0.0f && base_score < 1.0f,
"base_score must be in (0,1) for logistic loss"); "base_score must be in (0,1) for logistic loss");
base_score = -logf(1.0f / base_score - 1.0f); base_score = -std::log(1.0f / base_score - 1.0f);
} }
return base_score; return base_score;
} }
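The switch from expf/logf to the std:: overloads above does not change the logistic-loss mathematics. For a raw score x with p = sigma(x) and label y, the prediction transform, gradients and base margin are

\[ \sigma(x)=\frac{1}{1+e^{-x}}, \qquad g = p - y, \qquad h = p\,(1-p), \qquad \text{margin} = -\ln\!\left(\frac{1}{\text{base\_score}} - 1\right) \]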
@ -101,6 +101,7 @@ class RegLossObj : public IObjFunction{
} }
virtual ~RegLossObj(void) {} virtual ~RegLossObj(void) {}
virtual void SetParam(const char *name, const char *val) { virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("scale_pos_weight", name)) { if (!strcmp("scale_pos_weight", name)) {
scale_pos_weight = static_cast<float>(atof(val)); scale_pos_weight = static_cast<float>(atof(val));
} }
@ -123,7 +124,7 @@ class RegLossObj : public IObjFunction{
float p = loss.PredTransform(preds[i]); float p = loss.PredTransform(preds[i]);
float w = info.GetWeight(j); float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight; if (info.labels[j] == 1.0f) w *= scale_pos_weight;
gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w, gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w); loss.SecondOrderGradient(p, info.labels[j]) * w);
} }
} }
@ -156,6 +157,7 @@ class SoftmaxMultiClassObj : public IObjFunction {
} }
virtual ~SoftmaxMultiClassObj(void) {} virtual ~SoftmaxMultiClassObj(void) {}
virtual void SetParam(const char *name, const char *val) { virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp( "num_class", name )) nclass = atoi(val); if (!strcmp( "num_class", name )) nclass = atoi(val);
} }
virtual void GetGradient(const std::vector<float> &preds, virtual void GetGradient(const std::vector<float> &preds,
@ -247,6 +249,7 @@ class LambdaRankObj : public IObjFunction {
} }
virtual ~LambdaRankObj(void) {} virtual ~LambdaRankObj(void) {}
virtual void SetParam(const char *name, const char *val) { virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val); if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val);
if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val)); if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val));
if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val); if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val);
@ -419,8 +422,8 @@ class LambdaRankObjNDCG : public LambdaRankObj {
for (size_t i = 0; i < pairs.size(); ++i) { for (size_t i = 0; i < pairs.size(); ++i) {
unsigned pos_idx = pairs[i].pos_index; unsigned pos_idx = pairs[i].pos_index;
unsigned neg_idx = pairs[i].neg_index; unsigned neg_idx = pairs[i].neg_index;
float pos_loginv = 1.0f / logf(pos_idx + 2.0f); float pos_loginv = 1.0f / std::log(pos_idx + 2.0f);
float neg_loginv = 1.0f / logf(neg_idx + 2.0f); float neg_loginv = 1.0f / std::log(neg_idx + 2.0f);
int pos_label = static_cast<int>(sorted_list[pos_idx].label); int pos_label = static_cast<int>(sorted_list[pos_idx].label);
int neg_label = static_cast<int>(sorted_list[neg_idx].label); int neg_label = static_cast<int>(sorted_list[neg_idx].label);
float original = float original =
@ -438,7 +441,7 @@ class LambdaRankObjNDCG : public LambdaRankObj {
for (size_t i = 0; i < labels.size(); ++i) { for (size_t i = 0; i < labels.size(); ++i) {
const unsigned rel = static_cast<unsigned>(labels[i]); const unsigned rel = static_cast<unsigned>(labels[i]);
if (rel != 0) { if (rel != 0) {
sumdcg += ((1 << rel) - 1) / logf(static_cast<float>(i + 2)); sumdcg += ((1 << rel) - 1) / std::log(static_cast<float>(i + 2));
} }
} }
return static_cast<float>(sumdcg); return static_cast<float>(sumdcg);
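This lambda-rank objective and the EvalNDCG metric earlier compute the same discounted cumulative gain, now consistently through std::log (natural logarithm):

\[ \mathrm{DCG} = \sum_{i \ge 0} \frac{2^{\,rel_i} - 1}{\ln(i + 2)} \]

where i is the zero-based rank position and rel_i the integer relevance label.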


@ -67,6 +67,7 @@ namespace xgboost {
namespace learner { namespace learner {
/*! \brief factory function to create objective function by name */ /*! \brief factory function to create objective function by name */
inline IObjFunction* CreateObjFunction(const char *name) { inline IObjFunction* CreateObjFunction(const char *name) {
using namespace std;
if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare); if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik); if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify); if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);


@ -53,7 +53,7 @@ class TreeModel {
Param(void) { Param(void) {
max_depth = 0; max_depth = 0;
size_leaf_vector = 0; size_leaf_vector = 0;
memset(reserved, 0, sizeof(reserved)); std::memset(reserved, 0, sizeof(reserved));
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -61,6 +61,7 @@ class TreeModel {
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("num_roots", name)) num_roots = atoi(val); if (!strcmp("num_roots", name)) num_roots = atoi(val);
if (!strcmp("num_feature", name)) num_feature = atoi(val); if (!strcmp("num_feature", name)) num_feature = atoi(val);
if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val); if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
@ -272,6 +273,7 @@ class TreeModel {
param.num_nodes = param.num_roots; param.num_nodes = param.num_roots;
nodes.resize(param.num_nodes); nodes.resize(param.num_nodes);
stats.resize(param.num_nodes); stats.resize(param.num_nodes);
leaf_vector.resize(param.num_nodes * param.size_leaf_vector, 0.0f);
for (int i = 0; i < param.num_nodes; i ++) { for (int i = 0; i < param.num_nodes; i ++) {
nodes[i].set_leaf(0.0f); nodes[i].set_leaf(0.0f);
nodes[i].set_parent(-1); nodes[i].set_parent(-1);
@ -289,6 +291,9 @@ class TreeModel {
"TreeModel: wrong format"); "TreeModel: wrong format");
utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0, utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0,
"TreeModel: wrong format"); "TreeModel: wrong format");
if (param.size_leaf_vector != 0) {
utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
}
// chg deleted nodes // chg deleted nodes
deleted_nodes.resize(0); deleted_nodes.resize(0);
for (int i = param.num_roots; i < param.num_nodes; i ++) { for (int i = param.num_roots; i < param.num_nodes; i ++) {
@ -309,6 +314,7 @@ class TreeModel {
fo.Write(&param, sizeof(Param)); fo.Write(&param, sizeof(Param));
fo.Write(&nodes[0], sizeof(Node) * nodes.size()); fo.Write(&nodes[0], sizeof(Node) * nodes.size());
fo.Write(&stats[0], sizeof(NodeStat) * nodes.size()); fo.Write(&stats[0], sizeof(NodeStat) * nodes.size());
if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
} }
/*! /*!
* \brief add child nodes to node * \brief add child nodes to node
@ -486,15 +492,15 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
std::fill(data.begin(), data.end(), e); std::fill(data.begin(), data.end(), e);
} }
/*! \brief fill the vector with sparse vector */ /*! \brief fill the vector with sparse vector */
inline void Fill(const SparseBatch::Inst &inst) { inline void Fill(const RowBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].findex].fvalue = inst[i].fvalue; data[inst[i].index].fvalue = inst[i].fvalue;
} }
} }
/*! \brief drop the trace after fill, must be called after fill */ /*! \brief drop the trace after fill, must be called after fill */
inline void Drop(const SparseBatch::Inst &inst) { inline void Drop(const RowBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].findex].flag = -1; data[inst[i].index].flag = -1;
} }
} }
/*! \brief get ith value */ /*! \brief get ith value */


@ -22,10 +22,10 @@ struct TrainParam{
//----- the rest parameters are less important ---- //----- the rest parameters are less important ----
// minimum amount of hessian(weight) allowed in a child // minimum amount of hessian(weight) allowed in a child
float min_child_weight; float min_child_weight;
// weight decay parameter used to control leaf fitting // L2 regularization factor
float reg_lambda; float reg_lambda;
// reg method // L1 regularization factor
int reg_method; float reg_alpha;
// default direction choice // default direction choice
int default_direction; int default_direction;
// whether we want to do subsample // whether we want to do subsample
@ -36,6 +36,8 @@ struct TrainParam{
float colsample_bytree; float colsample_bytree;
// speed optimization for dense column // speed optimization for dense column
float opt_dense_col; float opt_dense_col;
// leaf vector size
int size_leaf_vector;
// number of threads to be used for tree construction, // number of threads to be used for tree construction,
// if OpenMP is enabled; if it equals 0, the system default is used // if OpenMP is enabled; if it equals 0, the system default is used
int nthread; int nthread;
@ -45,13 +47,14 @@ struct TrainParam{
min_child_weight = 1.0f; min_child_weight = 1.0f;
max_depth = 6; max_depth = 6;
reg_lambda = 1.0f; reg_lambda = 1.0f;
reg_method = 2; reg_alpha = 0.0f;
default_direction = 0; default_direction = 0;
subsample = 1.0f; subsample = 1.0f;
colsample_bytree = 1.0f; colsample_bytree = 1.0f;
colsample_bylevel = 1.0f; colsample_bylevel = 1.0f;
opt_dense_col = 1.0f; opt_dense_col = 1.0f;
nthread = 0; nthread = 0;
size_leaf_vector = 0;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -59,19 +62,22 @@ struct TrainParam{
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
using namespace std;
// sync-names // sync-names
if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val)); if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val)); if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val)); if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val)); if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val)); if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val)); if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val)); if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "reg_method")) reg_method = atoi(val); if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val)); if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val)); if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val)); if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "default_direction")) { if (!strcmp(name, "default_direction")) {
@ -82,31 +88,31 @@ struct TrainParam{
} }
// calculate the cost of loss function // calculate the cost of loss function
inline double CalcGain(double sum_grad, double sum_hess) const { inline double CalcGain(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) { if (sum_hess < min_child_weight) return 0.0;
return 0.0; if (reg_alpha == 0.0f) {
return Sqr(sum_grad) / (sum_hess + reg_lambda);
} else {
return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
} }
switch (reg_method) { }
case 1 : return Sqr(ThresholdL1(sum_grad, reg_lambda)) / sum_hess; // calculate cost of loss function with four statistics
case 2 : return Sqr(sum_grad) / (sum_hess + reg_lambda); inline double CalcGain(double sum_grad, double sum_hess,
case 3 : return double test_grad, double test_hess) const {
Sqr(ThresholdL1(sum_grad, 0.5 * reg_lambda)) / double w = CalcWeight(sum_grad, sum_hess);
(sum_hess + 0.5 * reg_lambda); double ret = test_grad * w + 0.5 * (test_hess + reg_lambda) * Sqr(w);
default: return Sqr(sum_grad) / sum_hess; if (reg_alpha == 0.0f) {
return - 2.0 * ret;
} else {
return - 2.0 * (ret + reg_alpha * std::abs(w));
} }
} }
// calculate weight given the statistics // calculate weight given the statistics
inline double CalcWeight(double sum_grad, double sum_hess) const { inline double CalcWeight(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) { if (sum_hess < min_child_weight) return 0.0;
return 0.0; if (reg_alpha == 0.0f) {
return -sum_grad / (sum_hess + reg_lambda);
} else { } else {
switch (reg_method) { return -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
case 1: return - ThresholdL1(sum_grad, reg_lambda) / sum_hess;
case 2: return - sum_grad / (sum_hess + reg_lambda);
case 3: return
- ThresholdL1(sum_grad, 0.5 * reg_lambda) /
(sum_hess + 0.5 * reg_lambda);
default: return - sum_grad / sum_hess;
}
} }
} }
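With reg_method gone, leaf weight and gain follow a single closed form in the L1 term reg_alpha (alpha) and the L2 term reg_lambda (lambda). Writing G = sum_grad, H = sum_hess, and assuming ThresholdL1 (defined elsewhere in this header, not shown in this hunk) is the usual soft-thresholding operator, the two branches above reduce to

\[ w^{*} = -\frac{T_{\alpha}(G)}{H+\lambda}, \qquad \mathrm{gain} = \frac{T_{\alpha}(G)^{2}}{H+\lambda}, \qquad T_{\alpha}(G) = \operatorname{sign}(G)\,\max(|G| - \alpha,\, 0) \]

with T_alpha(G) = G when reg_alpha is zero, recovering the familiar G^2/(H+lambda) and -G/(H+lambda).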
/*! \brief whether need forward small to big search: default right */ /*! \brief whether need forward small to big search: default right */
@ -153,6 +159,9 @@ struct GradStats {
inline void Clear(void) { inline void Clear(void) {
sum_grad = sum_hess = 0.0f; sum_grad = sum_hess = 0.0f;
} }
/*! \brief check if necessary information is ready */
inline static void CheckInfo(const BoosterInfo &info) {
}
/*! /*!
* \brief accumulate statistics, * \brief accumulate statistics,
* \param gpair the vector storing the gradient statistics * \param gpair the vector storing the gradient statistics
@ -189,13 +198,87 @@ struct GradStats {
/*! \brief set leaf vector value based on statistics */ /*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{ inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
} }
protected: // constructor to allow inheritance
GradStats(void) {}
/*! \brief add statistics to the data */ /*! \brief add statistics to the data */
inline void Add(double grad, double hess) { inline void Add(double grad, double hess) {
sum_grad += grad; sum_hess += hess; sum_grad += grad; sum_hess += hess;
} }
}; };
/*! \brief vectorized cv statistics */
template<unsigned vsize>
struct CVGradStats : public GradStats {
// additional statistics
GradStats train[vsize], valid[vsize];
// constructor
explicit CVGradStats(const TrainParam &param) {
utils::Check(param.size_leaf_vector == vsize,
"CVGradStats: vsize must match size_leaf_vector");
this->Clear();
}
/*! \brief check if necessary information is ready */
inline static void CheckInfo(const BoosterInfo &info) {
utils::Check(info.fold_index.size() != 0,
"CVGradStats: require fold_index");
}
/*! \brief clear the statistics */
inline void Clear(void) {
GradStats::Clear();
for (unsigned i = 0; i < vsize; ++i) {
train[i].Clear(); valid[i].Clear();
}
}
inline void Add(const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
bst_uint ridx) {
GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
const size_t step = info.fold_index.size();
for (unsigned i = 0; i < vsize; ++i) {
const bst_gpair &b = gpair[(i + 1) * step + ridx];
if (info.fold_index[ridx] == i) {
valid[i].Add(b.grad, b.hess);
} else {
train[i].Add(b.grad, b.hess);
}
}
}
/*! \brief calculate gain of the solution */
inline double CalcGain(const TrainParam &param) const {
double ret = 0.0;
for (unsigned i = 0; i < vsize; ++i) {
ret += param.CalcGain(train[i].sum_grad,
train[i].sum_hess,
vsize * valid[i].sum_grad,
vsize * valid[i].sum_hess);
}
return ret / vsize;
}
/*! \brief add statistics to the data */
inline void Add(const CVGradStats &b) {
GradStats::Add(b);
for (unsigned i = 0; i < vsize; ++i) {
train[i].Add(b.train[i]);
valid[i].Add(b.valid[i]);
}
}
/*! \brief set current value to a - b */
inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
GradStats::SetSubstract(a, b);
for (unsigned i = 0; i < vsize; ++i) {
train[i].SetSubstract(a.train[i], b.train[i]);
valid[i].SetSubstract(a.valid[i], b.valid[i]);
}
}
/*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
for (unsigned i = 0; i < vsize; ++i) {
vec[i] = param.learning_rate *
param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
}
}
};
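A reading of the new CVGradStats: each row contributes to valid[k] of its own fold k (taken from info.fold_index) and to train[i] of every other fold, and CalcGain scores the weight fitted on each fold's training statistics against that fold's held-out statistics through the four-argument TrainParam::CalcGain above. With V = vsize, per-fold training sums (G_i^t, H_i^t) and the scaled validation sums passed in the loop, the value returned is

\[ \mathrm{gain}_{cv} = \frac{1}{V}\sum_{i=1}^{V} -2\left(\tilde G_i\, w_i + \tfrac{1}{2}(\tilde H_i + \lambda)\, w_i^{2} + \alpha\,|w_i|\right), \qquad w_i = \mathrm{CalcWeight}(G_i^{t}, H_i^{t}), \quad (\tilde G_i, \tilde H_i) = V\,(G_i^{v}, H_i^{v}) \]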
/*! /*!
 * \brief statistics that are helpful to store * \brief statistics that are helpful to store
* and represent a split solution for the tree * and represent a split solution for the tree
@ -216,11 +299,11 @@ struct SplitEntry{
 * \param loss_chg the loss reduction obtained through the split * \param loss_chg the loss reduction obtained through the split
* \param split_index the feature index where the split is on * \param split_index the feature index where the split is on
*/ */
inline bool NeedReplace(bst_float loss_chg, unsigned split_index) const { inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
if (this->split_index() <= split_index) { if (this->split_index() <= split_index) {
return loss_chg > this->loss_chg; return new_loss_chg > this->loss_chg;
} else { } else {
return !(this->loss_chg > loss_chg); return !(this->loss_chg > new_loss_chg);
} }
} }
/*! /*!
@ -246,13 +329,13 @@ struct SplitEntry{
* \param default_left whether the missing value goes to left * \param default_left whether the missing value goes to left
* \return whether the proposed split is better and can replace current split * \return whether the proposed split is better and can replace current split
*/ */
inline bool Update(bst_float loss_chg, unsigned split_index, inline bool Update(bst_float new_loss_chg, unsigned split_index,
float split_value, bool default_left) { float new_split_value, bool default_left) {
if (this->NeedReplace(loss_chg, split_index)) { if (this->NeedReplace(new_loss_chg, split_index)) {
this->loss_chg = loss_chg; this->loss_chg = new_loss_chg;
if (default_left) split_index |= (1U << 31); if (default_left) split_index |= (1U << 31);
this->sindex = split_index; this->sindex = split_index;
this->split_value = split_value; this->split_value = new_split_value;
return true; return true;
} else { } else {
return false; return false;

src/tree/updater.cpp (new file, 21 lines)

@ -0,0 +1,21 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
IUpdater* CreateUpdater(const char *name) {
using namespace std;
if (!strcmp(name, "prune")) return new TreePruner();
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;
}
} // namespace tree
} // namespace xgboost
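The factory now lives in this new translation unit instead of the header. A minimal usage sketch against the IUpdater interface in updater.h; gpair, p_fmat, info and trees are placeholders for objects assumed to already exist with the types named in the Update signature:

    #include "./updater.h"
    // sketch: build an updater by name, configure it, then run one update step
    xgboost::tree::IUpdater *up = xgboost::tree::CreateUpdater("grow_colmaker");
    up->SetParam("max_depth", "6");
    up->SetParam("eta", "0.3");
    up->Update(gpair, p_fmat, info, trees);  // std::vector<bst_gpair>, IFMatrix*, BoosterInfo, std::vector<RegTree*>
    delete up;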


@ -14,9 +14,7 @@ namespace xgboost {
namespace tree { namespace tree {
/*! /*!
* \brief interface of tree update module, that performs update of a tree * \brief interface of tree update module, that performs update of a tree
* \tparam FMatrix the data type updater taking
*/ */
template<typename FMatrix>
class IUpdater { class IUpdater {
public: public:
/*! /*!
@ -28,7 +26,7 @@ class IUpdater {
/*! /*!
 * \brief perform update to the tree models * \brief perform update to the tree models
* \param gpair the gradient pair statistics of the data * \param gpair the gradient pair statistics of the data
 * \param fmat feature matrix that provides access to features * \param p_fmat feature matrix that provides access to features
 * \param info extra side information that may be needed, such as root index * \param info extra side information that may be needed, such as root index
 * \param trees pointer to the trees to be updated, the updater will change the content of the tree * \param trees pointer to the trees to be updated, the updater will change the content of the tree
* note: all the trees in the vector are updated, with the same statistics, * note: all the trees in the vector are updated, with the same statistics,
@ -36,36 +34,18 @@ class IUpdater {
* there can be multiple trees when we train random forest style model * there can be multiple trees when we train random forest style model
*/ */
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) = 0; const std::vector<RegTree*> &trees) = 0;
// destructor // destructor
virtual ~IUpdater(void) {} virtual ~IUpdater(void) {}
}; };
} // namespace tree
} // namespace xgboost
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
/*! /*!
* \brief create a updater based on name * \brief create a updater based on name
* \param name name of updater * \param name name of updater
* \return return the updater instance * \return return the updater instance
*/ */
template<typename FMatrix> IUpdater* CreateUpdater(const char *name);
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;
}
} // namespace tree } // namespace tree
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_H_ #endif // XGBOOST_TREE_UPDATER_H_


@ -15,8 +15,8 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
/*! \brief column-wise tree updater that grows a tree by enumerating splits over sorted features */ /*! \brief column-wise tree updater that grows a tree by enumerating splits over sorted features */
template<typename FMatrix, typename TStats> template<typename TStats>
class ColMaker: public IUpdater<FMatrix> { class ColMaker: public IUpdater {
public: public:
virtual ~ColMaker(void) {} virtual ~ColMaker(void) {}
// set training parameter // set training parameter
@ -24,16 +24,17 @@ class ColMaker: public IUpdater<FMatrix> {
param.SetParam(name, val); param.SetParam(name, val);
} }
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) { const std::vector<RegTree*> &trees) {
TStats::CheckInfo(info);
// rescale learning rate according to size of trees // rescale learning rate according to size of trees
float lr = param.learning_rate; float lr = param.learning_rate;
param.learning_rate = lr / trees.size(); param.learning_rate = lr / trees.size();
// build tree // build tree
for (size_t i = 0; i < trees.size(); ++i) { for (size_t i = 0; i < trees.size(); ++i) {
Builder builder(param); Builder builder(param);
builder.Update(gpair, fmat, info, trees[i]); builder.Update(gpair, p_fmat, info, trees[i]);
} }
param.learning_rate = lr; param.learning_rate = lr;
} }
@ -76,23 +77,22 @@ class ColMaker: public IUpdater<FMatrix> {
explicit Builder(const TrainParam &param) : param(param) {} explicit Builder(const TrainParam &param) : param(param) {}
// update one tree, growing // update one tree, growing
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
RegTree *p_tree) { RegTree *p_tree) {
this->InitData(gpair, fmat, info.root_index, *p_tree); this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree); this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) { for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree); this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
this->ResetPosition(this->qexpand, fmat, *p_tree); this->ResetPosition(qexpand_, p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &this->qexpand); this->UpdateQueueExpand(*p_tree, &qexpand_);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree); this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
// if nothing left to be expand, break // if nothing left to be expand, break
if (qexpand.size() == 0) break; if (qexpand_.size() == 0) break;
} }
// set all the rest expanding nodes to leaf // set all the rest expanding nodes to leaf
for (size_t i = 0; i < qexpand.size(); ++i) { for (size_t i = 0; i < qexpand_.size(); ++i) {
const int nid = qexpand[i]; const int nid = qexpand_[i];
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate); (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
} }
// remember auxiliary statistics in the tree node // remember auxiliary statistics in the tree node
@ -107,7 +107,7 @@ class ColMaker: public IUpdater<FMatrix> {
private: private:
// initialize temp data structure // initialize temp data structure
inline void InitData(const std::vector<bst_gpair> &gpair, inline void InitData(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, const IFMatrix &fmat,
const std::vector<unsigned> &root_index, const RegTree &tree) { const std::vector<unsigned> &root_index, const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree"); utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
const std::vector<bst_uint> &rowset = fmat.buffered_rowset(); const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
@ -138,7 +138,6 @@ class ColMaker: public IUpdater<FMatrix> {
} }
} }
} }
{ {
// initialize feature index // initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol()); unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@ -166,16 +165,16 @@ class ColMaker: public IUpdater<FMatrix> {
snode.reserve(256); snode.reserve(256);
} }
{// expand query {// expand query
qexpand.reserve(256); qexpand.clear(); qexpand_.reserve(256); qexpand_.clear();
for (int i = 0; i < tree.param.num_roots; ++i) { for (int i = 0; i < tree.param.num_roots; ++i) {
qexpand.push_back(i); qexpand_.push_back(i);
} }
} }
} }
/*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */ /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
inline void InitNewNode(const std::vector<int> &qexpand, inline void InitNewNode(const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, const IFMatrix &fmat,
const BoosterInfo &info, const BoosterInfo &info,
const RegTree &tree) { const RegTree &tree) {
{// setup statistics space for each tree node {// setup statistics space for each tree node
@ -222,24 +221,26 @@ class ColMaker: public IUpdater<FMatrix> {
qexpand = newnodes; qexpand = newnodes;
} }
// enumerate the split values of specific feature // enumerate the split values of specific feature
template<typename Iter> inline void EnumerateSplit(const ColBatch::Entry *begin,
inline void EnumerateSplit(Iter it, unsigned fid, const ColBatch::Entry *end,
int d_step,
bst_uint fid,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<ThreadEntry> &temp, std::vector<ThreadEntry> &temp) {
bool is_forward_search) { const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics // clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) { for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear(); temp[qexpand[j]].stats.Clear();
} }
// left statistics // left statistics
TStats c(param); TStats c(param);
while (it.Next()) { for(const ColBatch::Entry *it = begin; it != end; it += d_step) {
const bst_uint ridx = it.rindex(); const bst_uint ridx = it->index;
const int nid = position[ridx]; const int nid = position[ridx];
if (nid < 0) continue; if (nid < 0) continue;
// start working // start working
const float fvalue = it.fvalue(); const float fvalue = it->fvalue;
// get the statistics of nid // get the statistics of nid
ThreadEntry &e = temp[nid]; ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init // test if first hit, this is fine, because we set 0 during init
@ -248,11 +249,11 @@ class ColMaker: public IUpdater<FMatrix> {
e.last_fvalue = fvalue; e.last_fvalue = fvalue;
} else { } else {
// try to find a split // try to find a split
if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
c.SetSubstract(snode[nid].stats, e.stats); c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) { if (c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search); e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
} }
} }
// update the statistics // update the statistics
@ -267,38 +268,46 @@ class ColMaker: public IUpdater<FMatrix> {
c.SetSubstract(snode[nid].stats, e.stats); c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float delta = is_forward_search ? rt_eps : -rt_eps; const float delta = d_step == +1 ? rt_eps : -rt_eps;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search); e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
} }
} }
} }
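The d_step argument replaces the old is_forward_search flag: +1 scans the sorted column forward, so missing values default to the right child, and -1 scans it backward, so missing values default left. In both passes a candidate threshold between consecutive distinct feature values is scored exactly as in the loss_chg lines above,

\[ \Delta = \mathrm{Gain}(G_L, H_L) + \mathrm{Gain}(G_R, H_R) - \mathrm{Gain}(G, H) \]

where one side is the running accumulator e.stats, the other is its complement c obtained via SetSubstract from the node totals, and Gain is TrainParam::CalcGain.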
// find splits at current level, do split per level // find splits at current level, do split per level
inline void FindSplit(int depth, const std::vector<int> &qexpand, inline void FindSplit(int depth,
const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
RegTree *p_tree) { RegTree *p_tree) {
std::vector<unsigned> feat_set = feat_index; std::vector<bst_uint> feat_set = feat_index;
if (param.colsample_bylevel != 1.0f) { if (param.colsample_bylevel != 1.0f) {
random::Shuffle(feat_set); random::Shuffle(feat_set);
unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size()); unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included"); utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
feat_set.resize(n); feat_set.resize(n);
} }
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// start enumeration // start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size()); const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#if defined(_OPENMP) #if defined(_OPENMP)
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1); const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif #endif
#pragma omp parallel for schedule(dynamic, batch_size) #pragma omp parallel for schedule(dynamic, batch_size)
for (bst_omp_uint i = 0; i < nsize; ++i) { for (bst_omp_uint i = 0; i < nsize; ++i) {
const unsigned fid = feat_set[i]; const bst_uint fid = batch.col_index[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
if (param.need_forward_search(fmat.GetColDensity(fid))) { const ColBatch::Inst c = batch[i];
this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true); if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]);
}
if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]);
} }
if (param.need_backward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false);
} }
} }
// after this each thread's stemp will get the best candidates, aggregate results // after this each thread's stemp will get the best candidates, aggregate results
@ -318,8 +327,8 @@ class ColMaker: public IUpdater<FMatrix> {
} }
} }
// reset position of each data point after the split is created in the tree // reset position of each data point after the split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) { inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = fmat.buffered_rowset(); const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// step 1, set default direct nodes to default, and leaf nodes to -1 // step 1, set default direct nodes to default, and leaf nodes to -1
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size()); const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
@ -343,19 +352,24 @@ class ColMaker: public IUpdater<FMatrix> {
} }
std::sort(fsplits.begin(), fsplits.end()); std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin()); fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// start put things into right place
const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size()); utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
#pragma omp parallel for schedule(dynamic, 1) while (iter->Next()) {
for (bst_omp_uint i = 0; i < nfeats; ++i) { const ColBatch &batch = iter->Value();
const unsigned fid = fsplits[i]; for (size_t i = 0; i < batch.size; ++i) {
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) { ColBatch::Inst col = batch[i];
const bst_uint ridx = it.rindex(); const bst_uint fid = batch.col_index[i];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
int nid = position[ridx]; int nid = position[ridx];
if (nid == -1) continue; if (nid == -1) continue;
// go back to parent, correct those who are not default // go back to parent, correct those who are not default
nid = tree[nid].parent(); nid = tree[nid].parent();
if (tree[nid].split_index() == fid) { if (tree[nid].split_index() == fid) {
if (it.fvalue() < tree[nid].split_cond()) { if (fvalue < tree[nid].split_cond()) {
position[ridx] = tree[nid].cleft(); position[ridx] = tree[nid].cleft();
} else { } else {
position[ridx] = tree[nid].cright(); position[ridx] = tree[nid].cright();
@ -364,12 +378,13 @@ class ColMaker: public IUpdater<FMatrix> {
} }
} }
} }
}
//--data fields-- //--data fields--
const TrainParam &param; const TrainParam &param;
// number of omp thread used during training // number of omp thread used during training
int nthread; int nthread;
// Per feature: shuffle index of each feature index // Per feature: shuffle index of each feature index
std::vector<unsigned> feat_index; std::vector<bst_uint> feat_index;
// Instance Data: current node position in the tree of each instance // Instance Data: current node position in the tree of each instance
std::vector<int> position; std::vector<int> position;
// PerThread x PerTreeNode: statistics for per thread construction // PerThread x PerTreeNode: statistics for per thread construction
@ -377,7 +392,7 @@ class ColMaker: public IUpdater<FMatrix> {
/*! \brief TreeNode Data: statistics for each constructed node */ /*! \brief TreeNode Data: statistics for each constructed node */
std::vector<NodeEntry> snode; std::vector<NodeEntry> snode;
/*! \brief queue of nodes to be expanded */ /*! \brief queue of nodes to be expanded */
std::vector<int> qexpand; std::vector<int> qexpand_;
}; };
}; };


@ -12,18 +12,18 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */ /*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix> class TreePruner: public IUpdater {
class TreePruner: public IUpdater<FMatrix> {
public: public:
virtual ~TreePruner(void) {} virtual ~TreePruner(void) {}
// set training parameter // set training parameter
virtual void SetParam(const char *name, const char *val) { virtual void SetParam(const char *name, const char *val) {
using namespace std;
param.SetParam(name, val); param.SetParam(name, val);
if (!strcmp(name, "silent")) silent = atoi(val); if (!strcmp(name, "silent")) silent = atoi(val);
} }
// update the tree, do pruning // update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) { const std::vector<RegTree*> &trees) {
// rescale learning rate according to size of trees // rescale learning rate according to size of trees
@ -64,7 +64,7 @@ class TreePruner: public IUpdater<FMatrix> {
} }
} }
if (silent == 0) { if (silent == 0) {
printf("tree prunning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", utils::Printf("tree prunning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth()); tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
} }
} }
@ -75,7 +75,6 @@ class TreePruner: public IUpdater<FMatrix> {
// training parameter // training parameter
TrainParam param; TrainParam param;
}; };
} // namespace tree } // namespace tree
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_ #endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
@ -9,12 +9,13 @@
#include <limits> #include <limits>
#include "./param.h" #include "./param.h"
#include "./updater.h" #include "./updater.h"
#include "../utils/omp.h"
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */ /*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix, typename TStats> template<typename TStats>
class TreeRefresher: public IUpdater<FMatrix> { class TreeRefresher: public IUpdater {
public: public:
virtual ~TreeRefresher(void) {} virtual ~TreeRefresher(void) {}
// set training parameter // set training parameter
@ -23,16 +24,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
} }
// update the tree, do pruning // update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) { const std::vector<RegTree*> &trees) {
if (trees.size() == 0) return; if (trees.size() == 0) return;
// number of threads // number of threads
int nthread;
// thread temporal space // thread temporal space
std::vector< std::vector<TStats> > stemp; std::vector< std::vector<TStats> > stemp;
std::vector<RegTree::FVec> fvec_temp; std::vector<RegTree::FVec> fvec_temp;
// setup temp space for each thread // setup temp space for each thread
int nthread;
#pragma omp parallel #pragma omp parallel
{ {
nthread = omp_get_num_threads(); nthread = omp_get_num_threads();
@ -50,16 +51,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
fvec_temp[tid].Init(trees[0]->param.num_feature); fvec_temp[tid].Init(trees[0]->param.num_feature);
} }
// start accumulating statistics // start accumulating statistics
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(), utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size "); "too large batch size ");
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) { for (bst_omp_uint i = 0; i < nbatch; ++i) {
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i); const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid]; RegTree::FVec &feats = fvec_temp[tid];
@ -126,8 +127,6 @@ class TreeRefresher: public IUpdater<FMatrix> {
this->Refresh(gstats, tree[nid].cright(), p_tree); this->Refresh(gstats, tree[nid].cright(), p_tree);
} }
} }
// number of thread in the data
int nthread;
// training parameter // training parameter
TrainParam param; TrainParam param;
}; };
@ -24,15 +24,15 @@ class FeatMap {
// function definitions // function definitions
/*! \brief load feature map from text format */ /*! \brief load feature map from text format */
inline void LoadText(const char *fname) { inline void LoadText(const char *fname) {
FILE *fi = utils::FopenCheck(fname, "r"); std::FILE *fi = utils::FopenCheck(fname, "r");
this->LoadText(fi); this->LoadText(fi);
fclose(fi); std::fclose(fi);
} }
/*! \brief load feature map from text format */ /*! \brief load feature map from text format */
inline void LoadText(FILE *fi) { inline void LoadText(std::FILE *fi) {
int fid; int fid;
char fname[1256], ftype[1256]; char fname[1256], ftype[1256];
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) { while (std::fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
this->PushBack(fid, fname, ftype); this->PushBack(fid, fname, ftype);
} }
} }
@ -62,6 +62,7 @@ class FeatMap {
private: private:
inline static Type GetType(const char *tname) { inline static Type GetType(const char *tname) {
using namespace std;
if (!strcmp("i", tname)) return kIndicator; if (!strcmp("i", tname)) return kIndicator;
if (!strcmp("q", tname)) return kQuantitive; if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger; if (!strcmp("int", tname)) return kInteger;
@ -91,22 +91,21 @@ class IStream {
/*! \brief implementation of file i/o stream */ /*! \brief implementation of file i/o stream */
class FileStream : public IStream { class FileStream : public IStream {
private: private:
FILE *fp; std::FILE *fp;
public: public:
explicit FileStream(FILE *fp) { explicit FileStream(std::FILE *fp) : fp(fp) {
this->fp = fp;
} }
virtual size_t Read(void *ptr, size_t size) { virtual size_t Read(void *ptr, size_t size) {
return fread(ptr, size, 1, fp); return std::fread(ptr, size, 1, fp);
} }
virtual void Write(const void *ptr, size_t size) { virtual void Write(const void *ptr, size_t size) {
fwrite(ptr, size, 1, fp); std::fwrite(ptr, size, 1, fp);
} }
inline void Seek(size_t pos) { inline void Seek(size_t pos) {
fseek(fp, 0, SEEK_SET); std::fseek(fp, 0, SEEK_SET);
} }
inline void Close(void) { inline void Close(void) {
fclose(fp); std::fclose(fp);
} }
}; };
@ -9,13 +9,8 @@
#include <omp.h> #include <omp.h>
#else #else
#ifndef DISABLE_OPENMP #ifndef DISABLE_OPENMP
#ifndef _MSC_VER // use pragma message instead of warning
#warning "OpenMP is not available, compile to single thread code."\ #pragma message ("Warning: OpenMP is not available, xgboost will be compiled into single-thread code. Use OpenMP-enabled compiler to get benefit of multi-threading")
"You may want to ungrade your compiler to enable OpenMP support,"\
"to get benefit of multi-threading."
#else
// TODO add warning for msvc
#endif
#endif #endif
inline int omp_get_thread_num() { return 0; } inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; } inline int omp_get_num_threads() { return 1; }
@ -16,30 +16,21 @@
/*! namespace of PRNG */ /*! namespace of PRNG */
namespace xgboost { namespace xgboost {
namespace random { namespace random {
#ifndef XGBOOST_CUSTOMIZE_PRNG_
/*! \brief seed the PRNG */ /*! \brief seed the PRNG */
inline void Seed(uint32_t seed) { inline void Seed(unsigned seed) {
srand(seed); srand(seed);
} }
/*! \brief return a real number uniform in [0,1) */ /*! \brief basic function, uniform */
inline double NextDouble(void) { inline double Uniform(void) {
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0); return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0);
} }
/*! \brief return a real number uniform in (0,1) */ /*! \brief return a real number uniform in (0,1) */
inline double NextDouble2(void) { inline double NextDouble2(void) {
return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0); return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0);
} }
/*! \brief return a random number */
inline uint32_t NextUInt32(void) {
return (uint32_t)rand();
}
/*! \brief return a random number in n */
inline uint32_t NextUInt32(uint32_t n) {
return (uint32_t)floor(NextDouble() * n);
}
/*! \brief return x~N(0,1) */ /*! \brief return x~N(0,1) */
inline double SampleNormal() { inline double Normal(void) {
double x, y, s; double x, y, s;
do { do {
x = 2 * NextDouble2() - 1.0; x = 2 * NextDouble2() - 1.0;
@ -49,22 +40,24 @@ inline double SampleNormal() {
return x * sqrt(-2.0 * log(s) / s); return x * sqrt(-2.0 * log(s) / s);
} }
#else
// include declarations, to be implemented
void Seed(unsigned seed);
double Uniform(void);
double Normal(void);
#endif
/*! \brief return iid x,y ~N(0,1) */ /*! \brief return a real number uniform in [0,1) */
inline void SampleNormal2D(double &xx, double &yy) { inline double NextDouble(void) {
double x, y, s; return Uniform();
do { }
x = 2 * NextDouble2() - 1.0; /*! \brief return a random number in n */
y = 2 * NextDouble2() - 1.0; inline uint32_t NextUInt32(uint32_t n) {
s = x*x + y*y; return (uint32_t)std::floor(NextDouble() * n);
} while (s >= 1.0 || s == 0.0);
double t = sqrt(-2.0 * log(s) / s);
xx = x * t;
yy = y * t;
} }
/*! \brief return x~N(mu,sigma^2) */ /*! \brief return x~N(mu,sigma^2) */
inline double SampleNormal(double mu, double sigma) { inline double SampleNormal(double mu, double sigma) {
return SampleNormal() * sigma + mu; return Normal() * sigma + mu;
} }
/*! \brief return 1 with probability p, coin flip */ /*! \brief return 1 with probability p, coin flip */
inline int SampleBinary(double p) { inline int SampleBinary(double p) {
@ -90,7 +83,7 @@ struct Random{
inline void Seed(unsigned sd) { inline void Seed(unsigned sd) {
this->rseed = sd; this->rseed = sd;
#if defined(_MSC_VER)||defined(_WIN32) #if defined(_MSC_VER)||defined(_WIN32)
srand(rseed); ::xgboost::random::Seed(sd);
#endif #endif
} }
/*! \brief return a real number uniform in [0,1) */ /*! \brief return a real number uniform in [0,1) */
@ -98,8 +91,8 @@ struct Random{
// use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe
// For cygwin and mingw, this can slow down parallelism, but rand_r is only used in objective-inl.hpp, won't affect speed in general // For cygwin and mingw, this can slow down parallelism, but rand_r is only used in objective-inl.hpp, won't affect speed in general
// todo, replace with another PRNG // todo, replace with another PRNG
#if defined(_MSC_VER)||defined(_WIN32) #if defined(_MSC_VER)||defined(_WIN32)||defined(XGBOOST_STRICT_CXX98_)
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0); return Uniform();
#else #else
return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0); return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
#endif #endif
@ -7,10 +7,18 @@
*/ */
#define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS
#include <cstdio> #include <cstdio>
#include <cstdarg> #include <string>
#include <cstdlib> #include <cstdlib>
#include <vector>
#ifndef XGBOOST_STRICT_CXX98_
#include <cstdarg>
#endif
#if !defined(__GNUC__)
#define fopen64 std::fopen
#endif
#ifdef _MSC_VER #ifdef _MSC_VER
#define fopen64 fopen
// NOTE: sprintf_s is not equivalent to snprintf, // NOTE: sprintf_s is not equivalent to snprintf,
// they are equivalent when success, which is sufficient for our case // they are equivalent when success, which is sufficient for our case
#define snprintf sprintf_s #define snprintf sprintf_s
@ -18,19 +26,18 @@
#else #else
#ifdef _FILE_OFFSET_BITS #ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32 #if _FILE_OFFSET_BITS == 32
#warning "FILE OFFSET BITS defined to be 32 bit" #pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit")
#endif #endif
#endif #endif
#ifdef __APPLE__ #ifdef __APPLE__
#define off64_t off_t #define off64_t off_t
#define fopen64 fopen #define fopen64 std::fopen
#endif #endif
#define _FILE_OFFSET_BITS 64
extern "C" { extern "C" {
#include <sys/types.h> #include <sys/types.h>
}; }
#endif #endif
#ifdef _MSC_VER #ifdef _MSC_VER
@ -46,10 +53,11 @@ typedef long int64_t;
namespace xgboost { namespace xgboost {
/*! \brief namespace for helper utils of the project */ /*! \brief namespace for helper utils of the project */
namespace utils { namespace utils {
/*! \brief error message buffer length */
const int kErrorBuffer = 1 << 12;
#ifndef XGBOOST_CUSTOMIZE_ERROR_ /*! \brief error message buffer length */
const int kPrintBuffer = 1 << 12;
#ifndef XGBOOST_CUSTOMIZE_MSG_
/*! /*!
 * \brief handling of Assert error, caused by inappropriate input * \brief handling of Assert error, caused by inappropriate input
* \param msg error message * \param msg error message
@ -66,19 +74,50 @@ inline void HandleCheckError(const char *msg) {
fprintf(stderr, "%s\n", msg); fprintf(stderr, "%s\n", msg);
exit(-1); exit(-1);
} }
inline void HandlePrint(const char *msg) {
printf("%s", msg);
}
#else #else
#ifndef XGBOOST_STRICT_CXX98_
// include declarations, some one must implement this // include declarations, some one must implement this
void HandleAssertError(const char *msg); void HandleAssertError(const char *msg);
void HandleCheckError(const char *msg); void HandleCheckError(const char *msg);
void HandlePrint(const char *msg);
#endif #endif
#endif
#ifdef XGBOOST_STRICT_CXX98_
// these function pointers are to be assigned
extern "C" void (*Printf)(const char *fmt, ...);
extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
extern "C" void (*Assert)(int exp, const char *fmt, ...);
extern "C" void (*Check)(int exp, const char *fmt, ...);
extern "C" void (*Error)(const char *fmt, ...);
#else
/*! \brief printf, print message to the console */
inline void Printf(const char *fmt, ...) {
std::string msg(kPrintBuffer, '\0');
va_list args;
va_start(args, fmt);
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args);
HandlePrint(msg.c_str());
}
/*! \brief portable version of snprintf */
inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
va_list args;
va_start(args, fmt);
int ret = vsnprintf(buf, size, fmt, args);
va_end(args);
return ret;
}
/*! \brief assert an condition is true, use this to handle debug information */ /*! \brief assert an condition is true, use this to handle debug information */
inline void Assert(bool exp, const char *fmt, ...) { inline void Assert(bool exp, const char *fmt, ...) {
if (!exp) { if (!exp) {
std::string msg(kErrorBuffer, '\0'); std::string msg(kPrintBuffer, '\0');
va_list args; va_list args;
va_start(args, fmt); va_start(args, fmt);
vsnprintf(&msg[0], kErrorBuffer, fmt, args); vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args); va_end(args);
HandleAssertError(msg.c_str()); HandleAssertError(msg.c_str());
} }
@ -87,10 +126,10 @@ inline void Assert(bool exp, const char *fmt, ...) {
/*!\brief same as assert, but this is intended to be used as message for user*/ /*!\brief same as assert, but this is intended to be used as message for user*/
inline void Check(bool exp, const char *fmt, ...) { inline void Check(bool exp, const char *fmt, ...) {
if (!exp) { if (!exp) {
std::string msg(kErrorBuffer, '\0'); std::string msg(kPrintBuffer, '\0');
va_list args; va_list args;
va_start(args, fmt); va_start(args, fmt);
vsnprintf(&msg[0], kErrorBuffer, fmt, args); vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args); va_end(args);
HandleCheckError(msg.c_str()); HandleCheckError(msg.c_str());
} }
@ -99,22 +138,41 @@ inline void Check(bool exp, const char *fmt, ...) {
/*! \brief report error message, same as check */ /*! \brief report error message, same as check */
inline void Error(const char *fmt, ...) { inline void Error(const char *fmt, ...) {
{ {
std::string msg(kErrorBuffer, '\0'); std::string msg(kPrintBuffer, '\0');
va_list args; va_list args;
va_start(args, fmt); va_start(args, fmt);
vsnprintf(&msg[0], kErrorBuffer, fmt, args); vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args); va_end(args);
HandleCheckError(msg.c_str()); HandleCheckError(msg.c_str());
} }
} }
#endif
/*! \brief replace fopen, report error when the file open fails */ /*! \brief replace fopen, report error when the file open fails */
inline FILE *FopenCheck(const char *fname, const char *flag) { inline std::FILE *FopenCheck(const char *fname, const char *flag) {
FILE *fp = fopen64(fname, flag); std::FILE *fp = fopen64(fname, flag);
Check(fp != NULL, "can not open file \"%s\"\n", fname); Check(fp != NULL, "can not open file \"%s\"\n", fname);
return fp; return fp;
} }
} // namespace utils } // namespace utils
// easy utils that can be directly accessed in xgboost
/*! \brief get the beginning address of a vector */
template<typename T>
inline T *BeginPtr(std::vector<T> &vec) {
if (vec.size() == 0) {
return NULL;
} else {
return &vec[0];
}
}
/*! \brief get the beginning address of a vector */
template<typename T>
inline const T *BeginPtr(const std::vector<T> &vec) {
if (vec.size() == 0) {
return NULL;
} else {
return &vec[0];
}
}
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_UTILS_UTILS_H_ #endif // XGBOOST_UTILS_UTILS_H_
@ -50,6 +50,7 @@ class BoostLearnTask{
if (!strcmp("use_buffer", name)) use_buffer = atoi(val); if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
if (!strcmp("num_round", name)) num_round = atoi(val); if (!strcmp("num_round", name)) num_round = atoi(val);
if (!strcmp("pred_margin", name)) pred_margin = atoi(val); if (!strcmp("pred_margin", name)) pred_margin = atoi(val);
if (!strcmp("ntree_limit", name)) ntree_limit = atoi(val);
if (!strcmp("save_period", name)) save_period = atoi(val); if (!strcmp("save_period", name)) save_period = atoi(val);
if (!strcmp("eval_train", name)) eval_train = atoi(val); if (!strcmp("eval_train", name)) eval_train = atoi(val);
if (!strcmp("task", name)) task = val; if (!strcmp("task", name)) task = val;
@ -79,6 +80,7 @@ class BoostLearnTask{
save_period = 0; save_period = 0;
eval_train = 0; eval_train = 0;
pred_margin = 0; pred_margin = 0;
ntree_limit = 0;
dump_model_stats = 0; dump_model_stats = 0;
task = "train"; task = "train";
model_in = "NULL"; model_in = "NULL";
@ -186,7 +188,7 @@ class BoostLearnTask{
inline void TaskPred(void) { inline void TaskPred(void) {
std::vector<float> preds; std::vector<float> preds;
if (!silent) printf("start prediction...\n"); if (!silent) printf("start prediction...\n");
learner.Predict(*data, pred_margin != 0, &preds); learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w"); FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < preds.size(); i++) { for (size_t i = 0; i < preds.size(); i++) {
@ -217,6 +219,8 @@ class BoostLearnTask{
std::string task; std::string task;
/*! \brief name of predict file */ /*! \brief name of predict file */
std::string name_pred; std::string name_pred;
/*!\brief limit number of trees in prediction */
int ntree_limit;
/*!\brief whether to directly output margin value */ /*!\brief whether to directly output margin value */
int pred_margin; int pred_margin;
/*! \brief whether dump statistics along with model */ /*! \brief whether dump statistics along with model */
@ -234,7 +238,7 @@ class BoostLearnTask{
std::vector<io::DataMatrix*> deval; std::vector<io::DataMatrix*> deval;
std::vector<const io::DataMatrix*> devalall; std::vector<const io::DataMatrix*> devalall;
utils::FeatMap fmap; utils::FeatMap fmap;
learner::BoostLearner<FMatrixS> learner; learner::BoostLearner learner;
}; };
} }
@ -1,4 +1,4 @@
The solution has been created with Visual Studio Express 2013. The solution has been created with Visual Studio Express 2010.
Make sure to compile the Release version, unless you need to debug the code Make sure to compile the Release version, unless you need to debug the code
(and in the latter case modify the path in xgboost.py from release to test). (and in the latter case modify the path in xgboost.py from release to test).
Note that you have two projects in one solution and they need to be compiled to use the standalone executable from the command line Note that you have two projects in one solution and they need to be compiled to use the standalone executable from the command line
@ -1,11 +1,9 @@
 
Microsoft Visual Studio Solution File, Format Version 12.00 Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio Express 2013 for Windows Desktop # Visual Studio 2010
VisualStudioVersion = 12.0.30723.0 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{19766C3F-7508-49D0-BAAC-0988FCC9970C}"
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}"
EndProject EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{2E1AF937-28BB-4832-B916-309C9A0F6C4F}" Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}"
EndProject EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -15,22 +13,21 @@ Global
Release|x64 = Release|x64 Release|x64 = Release|x64
EndGlobalSection EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution GlobalSection(ProjectConfigurationPlatforms) = postSolution
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.ActiveCfg = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.Build.0 = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.ActiveCfg = Debug|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.Build.0 = Debug|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.ActiveCfg = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.Build.0 = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.ActiveCfg = Release|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.Build.0 = Release|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.ActiveCfg = Debug|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.Build.0 = Debug|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.ActiveCfg = Debug|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.Build.0 = Debug|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.ActiveCfg = Release|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.Build.0 = Release|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.ActiveCfg = Release|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.Build.0 = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.Build.0 = Release|x64
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE
@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration> <Configuration>Debug</Configuration>
@ -18,8 +18,14 @@
<Platform>x64</Platform> <Platform>x64</Platform>
</ProjectConfiguration> </ProjectConfiguration>
</ItemGroup> </ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\gbm\gbm.cpp" />
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\tree\updater.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals"> <PropertyGroup Label="Globals">
<ProjectGuid>{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}</ProjectGuid> <ProjectGuid>{19766C3F-7508-49D0-BAAC-0988FCC9970C}</ProjectGuid>
<RootNamespace>xgboost</RootNamespace> <RootNamespace>xgboost</RootNamespace>
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
@ -27,27 +33,23 @@
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization> <WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization> <WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
@ -111,10 +113,6 @@
<OptimizeReferences>true</OptimizeReferences> <OptimizeReferences>true</OptimizeReferences>
</Link> </Link>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
</ImportGroup> </ImportGroup>
@ -30,17 +30,17 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization> <WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
@ -1,126 +0,0 @@
# include xgboost library, must set chdir=TRUE
source("../xgboost.R", chdir=TRUE)
# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i,findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))
# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)
###
# build dmatrix from sparseMatrix
###
print ('start running example of build DMatrix from R.sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# build dmatrix from dense matrix
###
print ('start running example of build DMatrix from R.Matrix')
mat = as.matrix(data)
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# advanced: customized loss function
#
print("start running example of using a customized objective function")
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
# NOTE: when you use a customized loss function, the default prediction value is the margin
# this may make the built-in evaluation metrics not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# while the built-in evaluation error assumes the input is after the logistic transformation
# keep this in mind when you use the customization; you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with customized objective, we can also do step by step training
# simply look at xgboost.py"s implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from an initial base prediction
#
print ("start running example to start from an initial prediction")
# specify parameters via map, definitions are the same as the c++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=True, will always give you margin values before logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )
@ -2,14 +2,11 @@ Wrapper of XGBoost
===== =====
This folder provides wrappers of xgboost for other languages This folder provides wrappers of xgboost for other languages
Python Python
===== =====
* To make the python module, type ```make``` in the root directory of the project * To make the python module, type ```make``` in the root directory of the project
* Refer to the walk-through example in [python-example/demo.py](python-example/demo.py) * Refer also to the walk-through example in [demo folder](../demo/guide-python)
R R
===== =====
* To make the R wrapper, type ```make R``` in the root directory of the project * See [R-package](../R-package)
* The R module needs Rinternals.h; find its path on your system and add it to CPLUS_INCLUDE_PATH in the Makefile
* Refer to the walk-through example in [R-example/demo.R](R-example/demo.R)
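As a quick orientation for the Python section above, here is a minimal usage sketch of the built wrapper; it condenses the demo.py walk-through that appears further down in this diff rather than adding anything new. The `sys.path` entry and the data file locations are assumptions that depend on where the script is run, not fixed paths in the repository.

```python
# Minimal sketch (illustrative, not part of the repository):
# drive the freshly built Python wrapper the same way demo.py does.
import sys
sys.path.append('../wrapper')   # assumed location of xgboost.py and libxgboostwrapper.so
import xgboost as xgb

# load data in libsvm format (the agaricus sample used by the demos; paths are assumptions)
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')

# parameter names match the C++ command line version
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

# train for two rounds, printing the watchlist evaluation each round, then predict
bst = xgb.train(param, dtrain, 2, watchlist)
preds = bst.predict(dtest)
```

The same calls, plus model saving, customized objectives, and base margins, are exercised in the demo.py file removed later in this diff.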
@ -1,3 +0,0 @@
example of using xgboost from Python; the data is generated from demo/binary_classification, in libsvm format
for usage, see demo.py and the comments in it
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,121 +0,0 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost, you may need to change the following line
# alternatively, you can add the path to PYTHONPATH environment variable
sys.path.append('../')
import xgboost as xgb
### simple example
# load file from text file, also binary buffer generated by xgboost
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map, definitions are the same as the c++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# specify validations set to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, evallist)
# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.nice.txt','featmap.txt')
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse')
labels = []
row = []; col = []; dat = []
i = 0
for l in open('agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
for it in arr[1:]:
k,v = it.split(':')
row.append(i); col.append(int(k)); dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix( npymat)
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# advanced: customized loss function
#
print ('start running example of using a customized objective function')
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'max_depth':2, 'eta':1, 'silent':1 }
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make the built-in evaluation metrics not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# while the built-in evaluation error assumes the input is after the logistic transformation
# keep this in mind when you use the customization; you may need to write a customized evaluation function
def evalerror(preds, dtrain):
labels = dtrain.get_label()
# return a pair metric_name, result
# since preds are margin(before logistic transformation, cutoff at 0)
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map, definitions are the same as the c++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, evallist )
# Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=True, will always give you margin values before logistic transformation
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)
print ('this is result of running from initial prediction')
bst = xgb.train( param, dtrain, 1, evallist )
@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i
@ -1,222 +0,0 @@
# depends on matrix
succ <- require("Matrix")
if (!succ) {
stop("xgboost depends on Matrix library")
}
# load in library
dyn.load("./libxgboostR.so")
# constructing DMatrix
xgb.DMatrix <- function(data, info=list(), missing=0.0) {
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
} else if(is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing)
} else if(class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x)
} else {
stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
}
dmat <- structure(handle, class="xgb.DMatrix")
if (length(info) != 0) {
for (i in 1:length(info)) {
p <- info[i]
xgb.setinfo(dmat, names(p), p[[1]])
}
}
return(dmat)
}
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name != "label" &&
name != "weight" &&
name != "base_margin" ) {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
return(ret)
}
# set information into dmatrix, this mutate dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name == "label") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "weight") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "base_margin") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "group") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
return(TRUE)
}
stop(paste("xgb.setinfo: unknown info name", name))
return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist)
.Call("XGBoosterSetParam_R", handle, "seed", "0")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) != "character"){
stop("xgb.Booster: modelfile must be character");
}
.Call("XGBoosterLoadModel_R", handle, modelfile)
}
return(structure(handle, class="xgb.Booster"))
}
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list");
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix");
}
bst <- xgb.Booster(params, append(watchlist,dtrain))
for (i in 1:nrounds) {
if (is.null(obj)) {
succ <- xgb.iter.update(bst, dtrain, i-1)
} else {
pred <- xgb.predict(bst, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(bst, dtrain, gpair)
}
if (length(watchlist) != 0) {
if (is.null(feval)) {
msg <- xgb.iter.eval(bst, watchlist, i-1)
cat(msg); cat("\n")
} else {
cat("["); cat(i); cat("]");
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
cat(":"); cat(ret$value)
}
cat("\n")
}
}
}
return(bst)
}
# save model or DMatrix to file
xgb.save <- function(handle, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character");
}
if (class(handle) == "xgb.Booster") {
.Call("XGBoosterSaveModel_R", handle, fname);
return(TRUE)
}
if (class(handle) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE))
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
return(FALSE)
}
# predict
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.predict: first argument must be type xgb.Booster")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.predict: second argument must be type xgb.DMatrix")
}
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
return(ret)
}
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
if (class(booster) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
}
if (typeof(fname) != "character"){
stop("xgb.dump: second argument must be type character")
}
.Call("XGBoosterDumpModel_R", booster, fname, fmap)
return(TRUE)
}
##--------------------------------------
# the following are low level iteratively function, not needed
# if you do not want to use them
#---------------------------------------
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain)
return(TRUE)
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess)
return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
stop("xgb.eval: only accepts list of DMatrix as watchlist")
}
for (w in watchlist) {
if (class(w) != "xgb.DMatrix") {
stop("xgb.eval: watch list can only contain xgb.DMatrix")
}
}
evnames <- list()
if (length(watchlist) != 0) {
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
}
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
return(msg)
}
Some files were not shown because too many files have changed in this diff