diff --git a/.gitignore b/.gitignore index 4551c79cc..220fc602a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,12 +6,15 @@ # Compiled Dynamic libraries *.so *.dylib - +*.page # Compiled Static libraries *.lai *.la *.a *~ +*.Rcheck +*.rds +*.tar.gz *txt* *conf *buffer diff --git a/Makefile b/Makefile index 5599f3ab4..28a289ac6 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ BIN = xgboost OBJ = updater.o gbm.o io.o SLIB = wrapper/libxgboostwrapper.so -.PHONY: clean all python +.PHONY: clean all python Rpack all: $(BIN) $(OBJ) $(SLIB) @@ -40,19 +40,25 @@ $(OBJ) : install: cp -f -r $(BIN) $(INSTALL_PATH) -R-package.tar.gz: - rm -rf xgboost-R - cp -r R-package xgboost-R - rm -rf xgboost-R/src/*.o xgboost-R/src/*.so xgboost-R/src/*.dll - cp -r src xgboost-R/src/src - mkdir xgboost-R/src/wrapper - cp wrapper/xgboost_wrapper.h xgboost-R/src/wrapper - cp wrapper/xgboost_wrapper.cpp xgboost-R/src/wrapper - cp ./LICENSE xgboost-R - cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost-R/src/Makevars - cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost-R/src/Makevars.win - tar czf $@ xgboost-R - rm -rf xgboost-R +Rpack: + make clean + rm -rf xgboost xgboost*.tar.gz + cp -r R-package xgboost + rm -rf xgboost/inst/examples/*.buffer + rm -rf xgboost/inst/examples/*.model + rm -rf xgboost/inst/examples/dump* + rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll + rm -rf xgboost/demo/*.model xgboost/demo/*.buffer + cp -r src xgboost/src/src + mkdir xgboost/src/wrapper + cp wrapper/xgboost_wrapper.h xgboost/src/wrapper + cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper + cp ./LICENSE xgboost + cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars + cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win + R CMD build xgboost + rm -rf xgboost + R CMD check --as-cran xgboost*.tar.gz clean: - $(RM) $(OBJ) $(BIN) $(SLIB) *.o *~ */*~ */*/*~ + $(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~ diff 
--git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 7d60143bd..40705e317 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,12 +1,20 @@ Package: xgboost Type: Package -Title: R wrapper of xgboost +Title: eXtreme Gradient Boosting Version: 0.3-0 Date: 2014-08-23 -Author: Tianqi Chen, Tong He -Maintainer: Tianqi Chen , Tong He -Description: xgboost -License: file LICENSE +Author: Tianqi Chen , Tong He +Maintainer: Tong He +Description: This package is an R wrapper of xgboost, which is short for eXtreme + Gradient Boosting. It is an efficient and scalable implementation of + the gradient boosting framework. The package includes an efficient linear model + solver and tree learning algorithms. The package can automatically do + parallel computation with OpenMP, and it can be more than 10 times faster + than existing gradient boosting packages such as gbm. It supports various + objective functions, including regression, classification and ranking. The + package is made to be extensible, so that users are also allowed to define + their own objectives easily. 
+License: Apache License (== 2.0) | file LICENSE URL: https://github.com/tqchen/xgboost BugReports: https://github.com/tqchen/xgboost/issues Depends: diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 3fc74663e..4a7cb9465 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -10,5 +10,6 @@ export(xgb.save) export(xgb.train) export(xgboost) exportMethods(predict) +import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R index 5b438049c..3a79fd2fb 100644 --- a/R-package/R/getinfo.xgb.DMatrix.R +++ b/R-package/R/getinfo.xgb.DMatrix.R @@ -4,20 +4,23 @@ setClass('xgb.DMatrix') #' #' Get information of an xgb.DMatrix object #' -#' @param object Object of class "xgb.DMatrix" -#' @param name the name of the field to get -#' #' @examples #' data(iris) #' iris[,5] <- as.numeric(iris[,5]) #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' labels <- getinfo(dtrain, "label") +#' @rdname getinfo #' @export #' getinfo <- function(object, ...){ UseMethod("getinfo") } +#' @param object Object of class "xgb.DMatrix" +#' @param name the name of the field to get +#' @param ... other parameters +#' @rdname getinfo +#' @method getinfo xgb.DMatrix setMethod("getinfo", signature = "xgb.DMatrix", definition = function(object, name) { if (typeof(name) != "character") { diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index b51a1b19c..390ac689e 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -11,7 +11,8 @@ setClass("xgb.Booster") #' value of sum of functions, when outputmargin=TRUE, the prediction is #' untransformed margin value. In logistic regression, outputmargin=T will #' output value before logistic transformation. -#' +#' @param ntreelimit limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear. 
+#' set it to be value bigger than 0. It will use all trees by default. #' @examples #' data(iris) #' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) @@ -19,11 +20,18 @@ setClass("xgb.Booster") #' @export #' setMethod("predict", signature = "xgb.Booster", - definition = function(object, newdata, outputmargin = FALSE) { + definition = function(object, newdata, outputmargin = FALSE, ntreelimit = NULL) { if (class(newdata) != "xgb.DMatrix") { newdata <- xgb.DMatrix(newdata) } - ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), PACKAGE = "xgboost") + if (is.null(ntreelimit)) { + ntreelimit <- 0 + } else { + if (ntreelimit < 1){ + stop("predict: ntreelimit must be equal to or greater than 1") + } + } + ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost") return(ret) }) diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index 0c56829fa..8a93efc4d 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -6,22 +6,25 @@ setClass('xgb.DMatrix') #' Get a new DMatrix containing the specified rows of #' orginal xgb.DMatrix object #' -#' @param object Object of class "xgb.DMatrix" -#' @param idxset a integer vector of indices of rows needed -#' #' @examples #' data(iris) #' iris[,5] <- as.numeric(iris[,5]) #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' dsub <- slice(dtrain, 1:3) +#' @rdname slice #' @export #' slice <- function(object, ...){ UseMethod("slice") } +#' @param object Object of class "xgb.DMatrix" +#' @param idxset a integer vector of indices of rows needed +#' @param ... other parameters +#' @rdname slice +#' @method slice xgb.DMatrix setMethod("slice", signature = "xgb.DMatrix", - definition = function(object, idxset) { + definition = function(object, idxset, ...) 
{ if (class(object) != "xgb.DMatrix") { stop("slice: first argument dtrain must be xgb.DMatrix") } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index b3fb39748..da602478a 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,4 +1,5 @@ #' @importClassesFrom Matrix dgCMatrix dgeMatrix +#' @import methods # depends on matrix .onLoad <- function(libname, pkgname) { @@ -48,7 +49,6 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { } } handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost") - .Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE = "xgboost") if (length(params) != 0) { for (i in 1:length(params)) { p <- params[i] @@ -121,8 +121,8 @@ xgb.iter.eval <- function(booster, watchlist, iter) { stop("xgb.eval: watch list can only contain xgb.DMatrix") } } - evnames <- list() if (length(watchlist) != 0) { + evnames <- list() for (i in 1:length(watchlist)) { w <- watchlist[i] if (length(names(w)) == 0) { @@ -130,8 +130,10 @@ xgb.iter.eval <- function(booster, watchlist, iter) { } evnames <- append(evnames, names(w)) } + msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, + evnames, PACKAGE = "xgboost") + } else { + msg <- "" } - msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, - evnames, PACKAGE = "xgboost") return(msg) } diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index b108c2dad..4fcb71301 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -2,7 +2,7 @@ #' #' Save xgb.DMatrix object to binary file #' -#' @param model the model object. +#' @param DMatrix the model object. #' @param fname the name of the binary file. #' #' @examples diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 2c7813712..09406dc99 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -4,9 +4,12 @@ #' #' @param model the model object. #' @param fname the name of the binary file. 
-#' @param fmap feature map file representing the type of feature, to make it -#' look nice, run demo/demo.R for result and demo/featmap.txt for example -#' Format: https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model +#' @param fmap feature map file representing the type of feature. +#' Detailed description could be found at +#' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}. +#' Run inst/examples/demo.R for the result and inst/examples/featmap.txt +#' for example Format. +#' #' #' @examples #' data(iris) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index ceb87c1cb..58a575d03 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -15,7 +15,7 @@ #' } #' #' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for -#' further details. See also demo/demo.R for walkthrough example in R. +#' further details. See also inst/examples/demo.R for walkthrough example in R. #' @param dtrain takes an \code{xgb.DMatrix} as the input. #' @param nrounds the max number of iterations #' @param watchlist what information should be printed when \code{verbose=1} or @@ -24,10 +24,11 @@ #' watchlist=list(validation1=mat1, validation2=mat2) to watch #' the performance of each round's model on mat1 and mat2 #' -#' @param obj customized objective function. Given prediction and dtrain, -#' return gradient and second order gradient. -#' @param feval custimized evaluation function. Given prediction and dtrain, -#' return a \code{list(metric='metric-name', value='metric-value')}. +#' @param obj customized objective function. Returns gradient and second order +#' gradient with given prediction and dtrain. +#' @param feval customized evaluation function. Returns +#' \code{list(metric='metric-name', value='metric-value')} with given +#' prediction and dtrain. #' @param ... other parameters to pass to \code{params}. 
#' #' @details diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index c6fc99980..6f4633fb8 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -19,7 +19,7 @@ #' } #' #' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for -#' further details. See also demo/demo.R for walkthrough example in R. +#' further details. See also inst/examples/demo.R for walkthrough example in R. #' @param nrounds the max number of iterations #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both diff --git a/R-package/README.md b/R-package/README.md index 0fcf04981..60df71154 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -1,10 +1,21 @@ -This is subfolder for experimental version of R package. +# R package for xgboost. -Installation: +## Installation + +For the up-to-date version (which is recommended), please install from GitHub. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first. ```r require(devtools) install_github('xgboost','tqchen',subdir='R-package') ``` -Please visit [demo](https://github.com/tqchen/xgboost/blob/master/R-package/demo/demo.R) for more details. +For the stable version on CRAN, please run + +```r +install.packages('xgboost') +``` + +## Examples + +* Please visit [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for a walkthrough example. +* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for the Kaggle Higgs Challenge, including [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset. 
diff --git a/R-package/demo/00Index b/R-package/demo/00Index deleted file mode 100644 index 2ca4abd32..000000000 --- a/R-package/demo/00Index +++ /dev/null @@ -1 +0,0 @@ -demo R code for xgboost usages on agaricus data diff --git a/R-package/demo/agaricus.txt.test b/R-package/inst/examples/agaricus.txt.test similarity index 100% rename from R-package/demo/agaricus.txt.test rename to R-package/inst/examples/agaricus.txt.test diff --git a/R-package/demo/agaricus.txt.train b/R-package/inst/examples/agaricus.txt.train similarity index 100% rename from R-package/demo/agaricus.txt.train rename to R-package/inst/examples/agaricus.txt.train diff --git a/R-package/demo/demo.R b/R-package/inst/examples/demo.R similarity index 100% rename from R-package/demo/demo.R rename to R-package/inst/examples/demo.R diff --git a/R-package/demo/featmap.txt b/R-package/inst/examples/featmap.txt similarity index 100% rename from R-package/demo/featmap.txt rename to R-package/inst/examples/featmap.txt diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 4f63b5e92..05a25c152 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,14 +1,20 @@ % Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} \name{getinfo} \alias{getinfo} +\alias{getinfo,xgb.DMatrix-method} \title{Get information of an xgb.DMatrix object} \usage{ getinfo(object, ...) 
+ +\S4method{getinfo}{xgb.DMatrix}(object, name) } \arguments{ \item{object}{Object of class "xgb.DMatrix"} \item{name}{the name of the field to get} + +\item{...}{other parameters} } \description{ Get information of an xgb.DMatrix object diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index d43fd7362..d192997d2 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -4,7 +4,8 @@ \alias{predict,xgb.Booster-method} \title{Predict method for eXtreme Gradient Boosting model} \usage{ -\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE) +\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE, + ntreelimit = NULL) } \arguments{ \item{object}{Object of class "xgb.Boost"} @@ -13,9 +14,12 @@ \code{xgb.DMatrix}.} \item{outputmargin}{whether the prediction should be shown in the original - value of sum of functions, when outputmargin=TRUE, the prediction is - untransformed margin value. In logistic regression, outputmargin=T will - output value before logistic transformation.} +value of sum of functions, when outputmargin=TRUE, the prediction is +untransformed margin value. In logistic regression, outputmargin=T will +output value before logistic transformation.} + +\item{ntreelimit}{limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear. +set it to be value bigger than 0. It will use all trees by default.} } \description{ Predicted values based on xgboost model object. diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 06d79f6c4..7acb14a32 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -1,15 +1,21 @@ % Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} \name{slice} \alias{slice} +\alias{slice,xgb.DMatrix-method} \title{Get a new DMatrix containing the specified rows of orginal xgb.DMatrix object} \usage{ slice(object, ...) 
+ +\S4method{slice}{xgb.DMatrix}(object, idxset, ...) } \arguments{ \item{object}{Object of class "xgb.DMatrix"} \item{idxset}{a integer vector of indices of rows needed} + +\item{...}{other parameters} } \description{ Get a new DMatrix containing the specified rows of diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index d4932fa42..e5e70501d 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -6,7 +6,7 @@ xgb.DMatrix.save(DMatrix, fname) } \arguments{ -\item{model}{the model object.} +\item{DMatrix}{the model object.} \item{fname}{the name of the binary file.} } diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 1e0360b31..4d6933811 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -10,9 +10,11 @@ xgb.dump(model, fname, fmap = "") \item{fname}{the name of the binary file.} -\item{fmap}{feature map file representing the type of feature, to make it - look nice, run demo/demo.R for result and demo/featmap.txt for example - Format: https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model} +\item{fmap}{feature map file representing the type of feature. + Detailed description could be found at + \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}. + Run inst/examples/demo.R for the result and inst/examples/featmap.txt + for example Format.} } \description{ Save a xgboost model to text file. Could be parsed later. diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 1f29afa04..4da3b0013 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -20,7 +20,7 @@ xgb.train(params = list(), dtrain, nrounds, watchlist = list(), } See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for - further details. See also demo/demo.R for walkthrough example in R.} + further details. 
See also inst/examples/demo.R for walkthrough example in R.} \item{dtrain}{takes an \code{xgb.DMatrix} as the input.} @@ -32,11 +32,12 @@ xgb.train(params = list(), dtrain, nrounds, watchlist = list(), watchlist=list(validation1=mat1, validation2=mat2) to watch the performance of each round's model on mat1 and mat2} -\item{obj}{customized objective function. Given prediction and dtrain, -return gradient and second order gradient.} +\item{obj}{customized objective function. Returns gradient and second order +gradient with given prediction and dtrain.} -\item{feval}{custimized evaluation function. Given prediction and dtrain, -return a \code{list(metric='metric-name', value='metric-value')}.} +\item{feval}{customized evaluation function. Returns +\code{list(metric='metric-name', value='metric-value')} with given +prediction and dtrain.} \item{...}{other parameters to pass to \code{params}.} } diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index a76ce5b3d..2b6c1a124 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -25,7 +25,7 @@ xgboost(data = NULL, label = NULL, params = list(), nrounds, } See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for - further details. See also demo/demo.R for walkthrough example in R.} + further details. 
See also inst/examples/demo.R for walkthrough example in R.} \item{nrounds}{the max number of iterations} diff --git a/R-package/src/Makevars b/R-package/src/Makevars index 3539a913d..289f1a15a 100644 --- a/R-package/src/Makevars +++ b/R-package/src/Makevars @@ -1,32 +1,7 @@ # package root PKGROOT=../../ # _*_ mode: Makefile; _*_ -CXX=`R CMD config CXX` -TCFLAGS=`R CMD config CFLAGS` -# expose these flags to R CMD SHLIB -PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS) -PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS) -XGBFLAG= $(TCFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) - -ifeq ($(no_omp),1) - PKG_CPPFLAGS += -DDISABLE_OPENMP -endif - -CXXOBJ= xgboost_wrapper.o xgboost_io.o xgboost_gbm.o xgboost_updater.o -OBJECTS= xgboost_R.o $(CXXOBJ) - -.PHONY: all clean -all: $(SHLIB) -$(SHLIB): $(OBJECTS) - -xgboost_wrapper.o: $(PKGROOT)/wrapper/xgboost_wrapper.cpp -xgboost_io.o: $(PKGROOT)/src/io/io.cpp -xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp -xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp - -$(CXXOBJ) : - $(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) ) - -clean: - rm -rf *.so *.o *~ *.dll - +PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT) +PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) +OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index ae599fbf3..289f1a15a 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -1,33 +1,7 @@ # package root PKGROOT=../../ # _*_ mode: Makefile; _*_ -CXX=`Rcmd config CXX` -TCFLAGS=`Rcmd config CFLAGS` -# expose these flags to R CMD SHLIB -PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS) -PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS) -XGBFLAG= -O3 
-DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) - -ifeq ($(no_omp),1) - PKG_CPPFLAGS += -DDISABLE_OPENMP -endif - -CXXOBJ= xgboost_wrapper.o xgboost_io.o xgboost_gbm.o xgboost_updater.o -OBJECTS= xgboost_R.o $(CXXOBJ) - -.PHONY: all clean -all: $(SHLIB) -$(SHLIB): $(OBJECTS) - -xgboost_wrapper.o: $(PKGROOT)/wrapper/xgboost_wrapper.cpp -xgboost_io.o: $(PKGROOT)/src/io/io.cpp -xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp -xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp - -$(CXXOBJ) : - $(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) ) - -clean: - rm -rf *.so *.o *~ *.dll - +PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT) +PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) +OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index b358ef4ae..a7753dfa5 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -2,25 +2,55 @@ #include #include #include +#include #include "xgboost_R.h" #include "wrapper/xgboost_wrapper.h" #include "src/utils/utils.h" #include "src/utils/omp.h" #include "src/utils/matrix_csr.h" - +using namespace std; using namespace xgboost; + +extern "C" { + void XGBoostAssert_R(int exp, const char *fmt, ...); + void XGBoostCheck_R(int exp, const char *fmt, ...); + int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...); +} + // implements error handling namespace xgboost { namespace utils { -void HandleAssertError(const char *msg) { - error("%s", msg); -} -void HandleCheckError(const char *msg) { - error("%s", msg); +extern "C" { + void (*Printf)(const char *fmt, ...) = Rprintf; + int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) 
= XGBoostSPrintf_R; + void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R; + void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R; + void (*Error)(const char *fmt, ...) = error; } } // namespace utils + +namespace random { +void Seed(unsigned seed) { + warning("parameter seed is ignored, please set random seed using set.seed"); +} +double Uniform(void) { + return unif_rand(); +} +double Normal(void) { + return norm_rand(); +} +} // namespace random } // namespace xgboost +// call before wrapper starts +inline void _WrapperBegin(void) { + GetRNGstate(); +} +// call after wrapper starts +inline void _WrapperEnd(void) { + PutRNGstate(); +} + extern "C" { void _DMatrixFinalizer(SEXP ext) { if (R_ExternalPtrAddr(ext) == NULL) return; @@ -28,14 +58,17 @@ extern "C" { R_ClearExternalPtr(ext); } SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { + _WrapperBegin(); void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent)); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); + _WrapperEnd(); return ret; } SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing) { + _WrapperBegin(); SEXP dim = getAttrib(mat, R_DimSymbol); int nrow = INTEGER(dim)[0]; int ncol = INTEGER(dim)[1]; @@ -47,15 +80,17 @@ extern "C" { data[i * ncol +j] = din[i + nrow * j]; } } - void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing)); + void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing)); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); + _WrapperEnd(); return ret; } SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data) { + _WrapperBegin(); const int *col_ptr = INTEGER(indptr); const int *row_index = INTEGER(indices); const double *col_data = REAL(data); @@ -85,29 +120,36 @@ extern "C" { col_index[i] = 
csr_data[i].first; row_data[i] = csr_data[i].second; } - void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata ); + void *handle = XGDMatrixCreateFromCSR(BeginPtr(row_ptr), BeginPtr(col_index), + BeginPtr(row_data), row_ptr.size(), ndata ); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); + _WrapperEnd(); return ret; } SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { + _WrapperBegin(); int len = length(idxset); std::vector idxvec(len); for (int i = 0; i < len; ++i) { idxvec[i] = INTEGER(idxset)[i] - 1; } - void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), &idxvec[0], len); + void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len); SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); + _WrapperEnd(); return ret; } void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) { + _WrapperBegin(); XGDMatrixSaveBinary(R_ExternalPtrAddr(handle), CHAR(asChar(fname)), asInteger(silent)); + _WrapperEnd(); } void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) { + _WrapperBegin(); int len = length(array); const char *name = CHAR(asChar(field)); if (!strcmp("group", name)) { @@ -116,7 +158,8 @@ extern "C" { for (int i = 0; i < len; ++i) { vec[i] = static_cast(INTEGER(array)[i]); } - XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len); + XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len); + _WrapperEnd(); return; } { @@ -127,10 +170,12 @@ extern "C" { } XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), - &vec[0], len); + BeginPtr(vec), len); } + _WrapperEnd(); } SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) { + _WrapperBegin(); bst_ulong olen; const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen); @@ -139,6 +184,7 @@ 
extern "C" { REAL(ret)[i] = res[i]; } UNPROTECT(1); + _WrapperEnd(); return ret; } // functions related to booster @@ -148,28 +194,35 @@ extern "C" { R_ClearExternalPtr(ext); } SEXP XGBoosterCreate_R(SEXP dmats) { + _WrapperBegin(); int len = length(dmats); std::vector dvec; for (int i = 0; i < len; ++i){ dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); } - void *handle = XGBoosterCreate(&dvec[0], dvec.size()); + void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size()); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); UNPROTECT(1); + _WrapperEnd(); return ret; } void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { + _WrapperBegin(); XGBoosterSetParam(R_ExternalPtrAddr(handle), CHAR(asChar(name)), CHAR(asChar(val))); + _WrapperEnd(); } void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) { + _WrapperBegin(); XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle), asInteger(iter), R_ExternalPtrAddr(dtrain)); + _WrapperEnd(); } void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) { + _WrapperBegin(); utils::Check(length(grad) == length(hess), "gradient and hess must have same length"); int len = length(grad); std::vector tgrad(len), thess(len); @@ -180,9 +233,11 @@ extern "C" { } XGBoosterBoostOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain), - &tgrad[0], &thess[0], len); + BeginPtr(tgrad), BeginPtr(thess), len); + _WrapperEnd(); } SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) { + _WrapperBegin(); utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length"); int len = length(dmats); std::vector vec_dmats; @@ -197,28 +252,37 @@ extern "C" { } return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), asInteger(iter), - &vec_dmats[0], &vec_sptr[0], len)); + BeginPtr(vec_dmats), BeginPtr(vec_sptr), len)); + _WrapperEnd(); } - SEXP XGBoosterPredict_R(SEXP handle, SEXP 
dmat, SEXP output_margin) { + SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) { + _WrapperBegin(); bst_ulong olen; const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dmat), asInteger(output_margin), + asInteger(ntree_limit), &olen); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { REAL(ret)[i] = res[i]; } UNPROTECT(1); + _WrapperEnd(); return ret; } void XGBoosterLoadModel_R(SEXP handle, SEXP fname) { + _WrapperBegin(); XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))); + _WrapperEnd(); } void XGBoosterSaveModel_R(SEXP handle, SEXP fname) { + _WrapperBegin(); XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))); + _WrapperEnd(); } void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) { + _WrapperBegin(); bst_ulong olen; const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle), CHAR(asChar(fmap)), @@ -229,5 +293,6 @@ extern "C" { fprintf(fo, "%s", res[i]); } fclose(fo); + _WrapperEnd(); } } diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index ecacdeced..c988ff1e5 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -7,6 +7,7 @@ */ extern "C" { #include +#include } extern "C" { @@ -106,8 +107,9 @@ extern "C" { * \param handle handle * \param dmat data matrix * \param output_margin whether only output raw margin value + * \param ntree_limit limit number of trees used in prediction */ - SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin); + SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit); /*! 
* \brief load model from existing file * \param handle handle diff --git a/R-package/src/xgboost_assert.c b/R-package/src/xgboost_assert.c new file mode 100644 index 000000000..20b789492 --- /dev/null +++ b/R-package/src/xgboost_assert.c @@ -0,0 +1,33 @@ +#include +#include +#include + +// implements error handling +void XGBoostAssert_R(int exp, const char *fmt, ...) { + char buf[1024]; + if (exp == 0) { + va_list args; + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); + error("AssertError:%s\n", buf); + } +} +void XGBoostCheck_R(int exp, const char *fmt, ...) { + char buf[1024]; + if (exp == 0) { + va_list args; + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); + error("%s\n", buf); + } +} +int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) { + int ret; + va_list args; + va_start(args, fmt); + ret = vsnprintf(buf, size, fmt, args); + va_end(args); + return ret; +} diff --git a/R-package/vignettes/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw index ed4447d57..19254abaf 100644 --- a/R-package/vignettes/xgboost.Rnw +++ b/R-package/vignettes/xgboost.Rnw @@ -173,7 +173,7 @@ objective function. We also have \verb@slice@ for row extraction. It is useful in cross-validation. -For a walkthrough demo, please see \verb@R-package/demo/demo.R@ for further +For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@ for further details. 
\section{The Higgs Boson competition} diff --git a/README.md b/README.md index ba4b08bfd..38291b09d 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,11 @@ Version ====== * This version xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexibility * This version of xgboost is not compatible with 0.2x, due to huge amount of changes in code structure - - This means the model and buffer file of previous version can not be loaded in xgboost-unity + - This means the model and buffer file of previous version can not be loaded in xgboost-3.0 * For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22) * Change log in [CHANGES.md](CHANGES.md) XGBoost in Graphlab Create ====== -* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to data manipulation, graph processing, hyper-parameter search, and visualization of big data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html +* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. 
Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html * Nice blogpost by Jay Gu using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp index e9566f87e..a9d4c8d62 100644 --- a/src/gbm/gblinear-inl.hpp +++ b/src/gbm/gblinear-inl.hpp @@ -105,7 +105,10 @@ class GBLinear : public IGradBooster { virtual void Predict(IFMatrix *p_fmat, int64_t buffer_offset, const BoosterInfo &info, - std::vector *out_preds) { + std::vector *out_preds, + unsigned ntree_limit = 0) { + utils::Check(ntree_limit == 0, + "GBLinear::Predict ntrees is only valid for gbtree predictor"); std::vector &preds = *out_preds; preds.resize(0); // start collecting the prediction diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp index ae0e4af94..4713838e9 100644 --- a/src/gbm/gbm.cpp +++ b/src/gbm/gbm.cpp @@ -1,6 +1,7 @@ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #include +using namespace std; #include "./gbm.h" #include "./gbtree-inl.hpp" #include "./gblinear-inl.hpp" diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h index c548cab94..07dade4ac 100644 --- a/src/gbm/gbm.h +++ b/src/gbm/gbm.h @@ -57,11 +57,14 @@ class IGradBooster { * the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size") * \param info extra side information that may be needed for prediction * \param out_preds output vector to hold the predictions + * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means + * we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear */ virtual void Predict(IFMatrix *p_fmat, int64_t buffer_offset, const BoosterInfo &info, - std::vector *out_preds) = 0; + std::vector *out_preds, + unsigned ntree_limit = 0) = 0; /*! 
* \brief dump the model in text format * \param fmap feature map that may help give interpretations of feature diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index f66b49d00..8fea28727 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -105,7 +105,8 @@ class GBTree : public IGradBooster { virtual void Predict(IFMatrix *p_fmat, int64_t buffer_offset, const BoosterInfo &info, - std::vector *out_preds) { + std::vector *out_preds, + unsigned ntree_limit = 0) { int nthread; #pragma omp parallel { @@ -137,7 +138,8 @@ class GBTree : public IGradBooster { this->Pred(batch[i], buffer_offset < 0 ? -1 : buffer_offset + ridx, gid, info.GetRoot(ridx), &feats, - &preds[ridx * mparam.num_output_group + gid], stride); + &preds[ridx * mparam.num_output_group + gid], stride, + ntree_limit); } } } @@ -212,14 +214,16 @@ class GBTree : public IGradBooster { int bst_group, unsigned root_index, tree::RegTree::FVec *p_feats, - float *out_pred, size_t stride) { + float *out_pred, size_t stride, unsigned ntree_limit) { size_t itop = 0; float psum = 0.0f; // sum of leaf vector std::vector vec_psum(mparam.size_leaf_vector, 0.0f); const int64_t bid = mparam.BufferOffset(buffer_index, bst_group); + // number of valid trees + unsigned treeleft = ntree_limit == 0 ? 
std::numeric_limits::max() : ntree_limit; // load buffered results if any - if (bid >= 0) { + if (bid >= 0 && ntree_limit == 0) { itop = pred_counter[bid]; psum = pred_buffer[bid]; for (int i = 0; i < mparam.size_leaf_vector; ++i) { @@ -235,12 +239,13 @@ class GBTree : public IGradBooster { for (int j = 0; j < mparam.size_leaf_vector; ++j) { vec_psum[j] += trees[i]->leafvec(tid)[j]; } + if(--treeleft == 0) break; } } p_feats->Drop(inst); } // updated the buffered results - if (bid >= 0) { + if (bid >= 0 && ntree_limit == 0) { pred_counter[bid] = static_cast(trees.size()); pred_buffer[bid] = psum; for (int i = 0; i < mparam.size_leaf_vector; ++i) { diff --git a/src/io/io.cpp b/src/io/io.cpp index e413b2799..c2d9e26d3 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -1,6 +1,7 @@ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #include +using namespace std; #include "./io.h" #include "../utils/io.h" #include "../utils/utils.h" diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index bd18f0476..59d5093d7 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -54,8 +54,10 @@ class DMatrixSimple : public DataMatrix { for (size_t i = 0; i < batch.size; ++i) { RowBatch::Inst inst = batch[i]; row_data_.resize(row_data_.size() + inst.length); - memcpy(&row_data_[row_ptr_.back()], inst.data, - sizeof(RowBatch::Entry) * inst.length); + if (inst.length != 0) { + memcpy(&row_data_[row_ptr_.back()], inst.data, + sizeof(RowBatch::Entry) * inst.length); + } row_ptr_.push_back(row_ptr_.back() + inst.length); } } @@ -104,10 +106,10 @@ class DMatrixSimple : public DataMatrix { this->AddRow(feats); if (!silent) { - printf("%lux%lu matrix with %lu entries is loaded from %s\n", - static_cast(info.num_row()), - static_cast(info.num_col()), - static_cast(row_data_.size()), fname); + utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n", + static_cast(info.num_row()), + static_cast(info.num_col()), + 
static_cast(row_data_.size()), fname); } fclose(file); // try to load in additional file @@ -147,26 +149,26 @@ class DMatrixSimple : public DataMatrix { * \param fname file name, used to print message */ inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { - int magic; - utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); - utils::Check(magic == kMagic, "invalid format,magic number mismatch"); + int tmagic; + utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format"); + utils::Check(tmagic == kMagic, "invalid format,magic number mismatch"); info.LoadBinary(fs); FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); fmat_->LoadColAccess(fs); if (!silent) { - printf("%lux%lu matrix with %lu entries is loaded", - static_cast(info.num_row()), - static_cast(info.num_col()), - static_cast(row_data_.size())); + utils::Printf("%lux%lu matrix with %lu entries is loaded", + static_cast(info.num_row()), + static_cast(info.num_col()), + static_cast(row_data_.size())); if (fname != NULL) { - printf(" from %s\n", fname); + utils::Printf(" from %s\n", fname); } else { - printf("\n"); + utils::Printf("\n"); } if (info.group_ptr.size() != 0) { - printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1); + utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1); } } } @@ -177,8 +179,8 @@ class DMatrixSimple : public DataMatrix { */ inline void SaveBinary(const char* fname, bool silent = false) const { utils::FileStream fs(utils::FopenCheck(fname, "wb")); - int magic = kMagic; - fs.Write(&magic, sizeof(magic)); + int tmagic = kMagic; + fs.Write(&tmagic, sizeof(tmagic)); info.SaveBinary(fs); FMatrixS::SaveBinary(fs, row_ptr_, row_data_); @@ -186,13 +188,13 @@ class DMatrixSimple : public DataMatrix { fs.Close(); if (!silent) { - printf("%lux%lu matrix with %lu entries is saved to %s\n", - static_cast(info.num_row()), - static_cast(info.num_col()), - 
static_cast(row_data_.size()), fname); + utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n", + static_cast(info.num_row()), + static_cast(info.num_col()), + static_cast(row_data_.size()), fname); if (info.group_ptr.size() != 0) { - printf("data contains %u groups\n", - static_cast(info.group_ptr.size()-1)); + utils::Printf("data contains %u groups\n", + static_cast(info.group_ptr.size()-1)); } } } @@ -244,8 +246,8 @@ class DMatrixSimple : public DataMatrix { at_first_ = false; batch_.size = parent_->row_ptr_.size() - 1; batch_.base_rowid = 0; - batch_.ind_ptr = &parent_->row_ptr_[0]; - batch_.data_ptr = &parent_->row_data_[0]; + batch_.ind_ptr = BeginPtr(parent_->row_ptr_); + batch_.data_ptr = BeginPtr(parent_->row_data_); return true; } virtual const RowBatch &Value(void) const { diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index f099eb1a9..997268ff3 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -110,9 +110,9 @@ class FMatrixS : public IFMatrix{ const std::vector &data) { size_t nrow = ptr.size() - 1; fo.Write(&nrow, sizeof(size_t)); - fo.Write(&ptr[0], ptr.size() * sizeof(size_t)); + fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t)); if (data.size() != 0) { - fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry)); + fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry)); } } /*! 
@@ -127,11 +127,11 @@ class FMatrixS : public IFMatrix{ size_t nrow; utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format"); out_ptr->resize(nrow + 1); - utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0, + utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0, "invalid input file format"); out_data->resize(out_ptr->back()); if (out_data->size() != 0) { - utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0, + utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0, "invalid input file format"); } } @@ -213,8 +213,8 @@ class FMatrixS : public IFMatrix{ col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx], static_cast(ptr[ridx+1] - ptr[ridx])); } - batch_.col_index = &col_index_[0]; - batch_.col_data = &col_data_[0]; + batch_.col_index = BeginPtr(col_index_); + batch_.col_data = BeginPtr(col_data_); this->BeforeFirst(); } // data content diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index 542b6f6f5..bef84900a 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -98,8 +98,8 @@ struct MetaInfo { group_ptr.push_back(group_ptr.back()+nline); } if (!silent) { - printf("%u groups are loaded from %s\n", - static_cast(group_ptr.size()-1), fname); + utils::Printf("%u groups are loaded from %s\n", + static_cast(group_ptr.size()-1), fname); } fclose(fi); return true; @@ -125,15 +125,15 @@ struct MetaInfo { } // try to load weight information from file, if exists inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) { - std::vector &weights = this->GetFloatInfo(field); + std::vector &data = this->GetFloatInfo(field); FILE *fi = fopen64(fname, "r"); if (fi == NULL) return false; float wt; while (fscanf(fi, "%f", &wt) == 1) { - weights.push_back(wt); + data.push_back(wt); } if (!silent) { - printf("loading %s from %s\n", field, fname); + utils::Printf("loading %s from %s\n", 
field, fname); } fclose(fi); return true; diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index 3058cf06b..52877e17b 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include "./evaluation.h" #include "./helper_utils.h" @@ -183,7 +183,7 @@ struct EvalAMS : public IEvaluator { } } if (ntop == ndata) { - fprintf(stderr, "\tams-ratio=%g", static_cast(thresindex) / ndata); + utils::Printf("\tams-ratio=%g", static_cast(thresindex) / ndata); return static_cast(tams); } else { return static_cast(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp))); diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h index 90f4a5839..ec37e1f4a 100644 --- a/src/learner/evaluation.h +++ b/src/learner/evaluation.h @@ -73,7 +73,7 @@ class EvalSet{ for (size_t i = 0; i < evals_.size(); ++i) { float res = evals_[i]->Eval(preds, info); char tmp[1024]; - snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res); + utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res); result += tmp; } return result; diff --git a/src/learner/helper_utils.h b/src/learner/helper_utils.h index e2f8a3574..aa1e66bbc 100644 --- a/src/learner/helper_utils.h +++ b/src/learner/helper_utils.h @@ -7,6 +7,7 @@ */ #include #include +#include #include namespace xgboost { namespace learner { diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index 8e7bce0a8..5d7c9d06a 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -63,14 +63,14 @@ class BoostLearner { } char str_temp[25]; if (num_feature > mparam.num_feature) { - snprintf(str_temp, sizeof(str_temp), "%u", num_feature); + utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature); this->SetParam("bst:num_feature", str_temp); } - snprintf(str_temp, sizeof(str_temp), "%lu", + utils::SPrintf(str_temp, sizeof(str_temp), "%lu", 
static_cast(buffer_size)); this->SetParam("num_pbuffer", str_temp); if (!silent) { - printf("buffer_size=%ld\n", static_cast(buffer_size)); + utils::Printf("buffer_size=%ld\n", static_cast(buffer_size)); } } /*! @@ -183,7 +183,7 @@ class BoostLearner { const std::vector &evname) { std::string res; char tmp[256]; - snprintf(tmp, sizeof(tmp), "[%d]", iter); + utils::SPrintf(tmp, sizeof(tmp), "[%d]", iter); res = tmp; for (size_t i = 0; i < evals.size(); ++i) { this->PredictRaw(*evals[i], &preds_); @@ -212,11 +212,14 @@ class BoostLearner { * \param data input data * \param output_margin whether to only predict margin value instead of transformed prediction * \param out_preds output vector that stores the prediction + * \param ntree_limit limit number of trees used for boosted tree + * predictor, when it equals 0, this means we are using all the trees */ inline void Predict(const DMatrix &data, bool output_margin, - std::vector *out_preds) const { - this->PredictRaw(data, out_preds); + std::vector *out_preds, + unsigned ntree_limit = 0) const { + this->PredictRaw(data, out_preds, ntree_limit); if (!output_margin) { obj_->PredTransform(out_preds); } @@ -246,11 +249,14 @@ class BoostLearner { * \brief get un-transformed prediction * \param data training data matrix * \param out_preds output vector that stores the prediction + * \param ntree_limit limit number of trees used for boosted tree + * predictor, when it equals 0, this means we are using all the trees */ inline void PredictRaw(const DMatrix &data, - std::vector *out_preds) const { + std::vector *out_preds, + unsigned ntree_limit = 0) const { gbm_->Predict(data.fmat(), this->FindBufferOffset(data), - data.info.info, out_preds); + data.info.info, out_preds, ntree_limit); // add base margin std::vector &preds = *out_preds; const bst_omp_uint ndata = static_cast(preds.size()); diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index 9e338a6b2..576549eac 100644 --- 
a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -6,9 +6,9 @@ * \author Tianqi Chen, Kailong Chen */ #include -#include #include #include +#include #include #include "../data.h" #include "./objective.h" @@ -37,7 +37,7 @@ struct LossType { case kLogisticRaw: case kLinearSquare: return x; case kLogisticClassify: - case kLogisticNeglik: return 1.0f / (1.0f + expf(-x)); + case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x)); default: utils::Error("unknown loss_type"); return 0.0f; } } @@ -50,7 +50,7 @@ struct LossType { inline float FirstOrderGradient(float predt, float label) const { switch (loss_type) { case kLinearSquare: return predt - label; - case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt)); + case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt)); case kLogisticClassify: case kLogisticNeglik: return predt - label; default: utils::Error("unknown loss_type"); return 0.0f; @@ -65,7 +65,7 @@ struct LossType { inline float SecondOrderGradient(float predt, float label) const { switch (loss_type) { case kLinearSquare: return 1.0f; - case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt)); + case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt)); case kLogisticClassify: case kLogisticNeglik: return predt * (1 - predt); default: utils::Error("unknown loss_type"); return 0.0f; @@ -80,7 +80,7 @@ struct LossType { loss_type == kLogisticNeglik ) { utils::Check(base_score > 0.0f && base_score < 1.0f, "base_score must be in (0,1) for logistic loss"); - base_score = -logf(1.0f / base_score - 1.0f); + base_score = -std::log(1.0f / base_score - 1.0f); } return base_score; } @@ -419,8 +419,8 @@ class LambdaRankObjNDCG : public LambdaRankObj { for (size_t i = 0; i < pairs.size(); ++i) { unsigned pos_idx = pairs[i].pos_index; unsigned neg_idx = pairs[i].neg_index; - float pos_loginv = 1.0f / logf(pos_idx + 2.0f); - float neg_loginv = 1.0f / logf(neg_idx + 2.0f); + float pos_loginv = 1.0f / std::log(pos_idx + 2.0f); + float neg_loginv = 
1.0f / std::log(neg_idx + 2.0f); int pos_label = static_cast(sorted_list[pos_idx].label); int neg_label = static_cast(sorted_list[neg_idx].label); float original = @@ -438,7 +438,7 @@ class LambdaRankObjNDCG : public LambdaRankObj { for (size_t i = 0; i < labels.size(); ++i) { const unsigned rel = static_cast(labels[i]); if (rel != 0) { - sumdcg += ((1 << rel) - 1) / logf(static_cast(i + 2)); + sumdcg += ((1 << rel) - 1) / std::log(static_cast(i + 2)); } } return static_cast(sumdcg); diff --git a/src/tree/param.h b/src/tree/param.h index 92bc1c990..4b0f154f8 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -302,11 +302,11 @@ struct SplitEntry{ * \param loss_chg the loss reduction get through the split * \param split_index the feature index where the split is on */ - inline bool NeedReplace(bst_float loss_chg, unsigned split_index) const { + inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const { if (this->split_index() <= split_index) { - return loss_chg > this->loss_chg; + return new_loss_chg > this->loss_chg; } else { - return !(this->loss_chg > loss_chg); + return !(this->loss_chg > new_loss_chg); } } /*! 
@@ -332,13 +332,13 @@ struct SplitEntry{ * \param default_left whether the missing value goes to left * \return whether the proposed split is better and can replace current split */ - inline bool Update(bst_float loss_chg, unsigned split_index, - float split_value, bool default_left) { - if (this->NeedReplace(loss_chg, split_index)) { - this->loss_chg = loss_chg; + inline bool Update(bst_float new_loss_chg, unsigned split_index, + float new_split_value, bool default_left) { + if (this->NeedReplace(new_loss_chg, split_index)) { + this->loss_chg = new_loss_chg; if (default_left) split_index |= (1U << 31); this->sindex = split_index; - this->split_value = split_value; + this->split_value = new_split_value; return true; } else { return false; diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp index 25bee7922..09b63eb49 100644 --- a/src/tree/updater.cpp +++ b/src/tree/updater.cpp @@ -1,6 +1,7 @@ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #include +using namespace std; #include "./updater.h" #include "./updater_prune-inl.hpp" #include "./updater_refresh-inl.hpp" diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp index e3d5be11a..98fdf5ee4 100644 --- a/src/tree/updater_prune-inl.hpp +++ b/src/tree/updater_prune-inl.hpp @@ -63,8 +63,8 @@ class TreePruner: public IUpdater { } } if (silent == 0) { - printf("tree prunning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", - tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth()); + utils::Printf("tree prunning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", + tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth()); } } diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp index d184dcb39..a37630333 100644 --- a/src/tree/updater_refresh-inl.hpp +++ b/src/tree/updater_refresh-inl.hpp @@ -26,14 +26,14 @@ class TreeRefresher: public IUpdater { virtual void Update(const std::vector &gpair, 
IFMatrix *p_fmat, const BoosterInfo &info, - const std::vector &trees) { + const std::vector &trees) { if (trees.size() == 0) return; // number of threads - int nthread; // thread temporal space std::vector< std::vector > stemp; std::vector fvec_temp; // setup temp space for each thread + int nthread; #pragma omp parallel { nthread = omp_get_num_threads(); @@ -127,8 +127,6 @@ class TreeRefresher: public IUpdater { this->Refresh(gstats, tree[nid].cright(), p_tree); } } - // number of thread in the data - int nthread; // training parameter TrainParam param; }; diff --git a/src/utils/io.h b/src/utils/io.h index 276dd7312..d98b3e4dc 100644 --- a/src/utils/io.h +++ b/src/utils/io.h @@ -100,12 +100,10 @@ class ISeekStream: public IStream { /*! \brief implementation of file i/o stream */ class FileStream : public ISeekStream { public: + explicit FileStream(FILE *fp) : fp(fp) {} explicit FileStream(void) { this->fp = NULL; } - explicit FileStream(FILE *fp) { - this->fp = fp; - } virtual size_t Read(void *ptr, size_t size) { return fread(ptr, size, 1, fp); } diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h index 44a3b8818..b2768b2ea 100644 --- a/src/utils/matrix_csr.h +++ b/src/utils/matrix_csr.h @@ -163,7 +163,7 @@ struct SparseCSRFileBuilder { fo->Write(rptr); // setup buffer space buffer_rptr.resize(rptr.size()); - buffer.reserve(buffer_size); + buffer_temp.reserve(buffer_size); buffer_data.resize(buffer_size); saved_offset.clear(); saved_offset.resize(rptr.size() - 1, 0); diff --git a/src/utils/omp.h b/src/utils/omp.h index 0380ebd67..5eb5612e0 100644 --- a/src/utils/omp.h +++ b/src/utils/omp.h @@ -9,13 +9,8 @@ #include #else #ifndef DISABLE_OPENMP -#ifndef _MSC_VER -#warning "OpenMP is not available, compile to single thread code."\ - "You may want to ungrade your compiler to enable OpenMP support,"\ - "to get benefit of multi-threading." 
-#else -// TODO add warning for msvc -#endif +// use pragma message instead of warning +#pragma message ("Warning: OpenMP is not available, xgboost will be compiled into single-thread code. Use OpenMP-enabled compiler to get benefit of multi-threading") #endif inline int omp_get_thread_num() { return 0; } inline int omp_get_num_threads() { return 1; } diff --git a/src/utils/random.h b/src/utils/random.h index bf8b04d9d..57e1f243d 100644 --- a/src/utils/random.h +++ b/src/utils/random.h @@ -16,30 +16,21 @@ /*! namespace of PRNG */ namespace xgboost { namespace random { - +#ifndef XGBOOST_CUSTOMIZE_PRNG_ /*! \brief seed the PRNG */ -inline void Seed(uint32_t seed) { +inline void Seed(unsigned seed) { srand(seed); } -/*! \brief return a real number uniform in [0,1) */ -inline double NextDouble(void) { +/*! \brief basic function, uniform */ +inline double Uniform(void) { return static_cast(rand()) / (static_cast(RAND_MAX)+1.0); } /*! \brief return a real numer uniform in (0,1) */ inline double NextDouble2(void) { return (static_cast(rand()) + 1.0) / (static_cast(RAND_MAX)+2.0); } - -/*! \brief return a random number */ -inline uint32_t NextUInt32(void) { - return (uint32_t)rand(); -} -/*! \brief return a random number in n */ -inline uint32_t NextUInt32(uint32_t n) { - return (uint32_t)floor(NextDouble() * n); -} /*! \brief return x~N(0,1) */ -inline double SampleNormal() { +inline double Normal(void) { double x, y, s; do { x = 2 * NextDouble2() - 1.0; @@ -49,22 +40,24 @@ inline double SampleNormal() { return x * sqrt(-2.0 * log(s) / s); } +#else +// include declarations, to be implemented +void Seed(unsigned seed); +double Uniform(void); +double Normal(void); +#endif -/*! \brief return iid x,y ~N(0,1) */ -inline void SampleNormal2D(double &xx, double &yy) { - double x, y, s; - do { - x = 2 * NextDouble2() - 1.0; - y = 2 * NextDouble2() - 1.0; - s = x*x + y*y; - } while (s >= 1.0 || s == 0.0); - double t = sqrt(-2.0 * log(s) / s); - xx = x * t; - yy = y * t; +/*! 
\brief return a real number uniform in [0,1) */ +inline double NextDouble(void) { + return Uniform(); +} +/*! \brief return a random number in n */ +inline uint32_t NextUInt32(uint32_t n) { + return (uint32_t)floor(NextDouble() * n); } /*! \brief return x~N(mu,sigma^2) */ inline double SampleNormal(double mu, double sigma) { - return SampleNormal() * sigma + mu; + return Normal() * sigma + mu; } /*! \brief return 1 with probability p, coin flip */ inline int SampleBinary(double p) { @@ -90,7 +83,7 @@ struct Random{ inline void Seed(unsigned sd) { this->rseed = sd; #if defined(_MSC_VER)||defined(_WIN32) - srand(rseed); + ::xgboost::random::Seed(sd); #endif } /*! \brief return a real number uniform in [0,1) */ @@ -98,8 +91,8 @@ struct Random{ // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe // For cygwin and mingw, this can slows down parallelism, but rand_r is only used in objective-inl.hpp, won't affect speed in general // todo, replace with another PRNG -#if defined(_MSC_VER)||defined(_WIN32) - return static_cast(rand()) / (static_cast(RAND_MAX) + 1.0); +#if defined(_MSC_VER)||defined(_WIN32)||defined(XGBOOST_STRICT_CXX98_) + return Uniform(); #else return static_cast(rand_r(&rseed)) / (static_cast(RAND_MAX) + 1.0); #endif diff --git a/src/utils/utils.h b/src/utils/utils.h index 501895224..5c3342d8e 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -7,11 +7,18 @@ */ #define _CRT_SECURE_NO_WARNINGS #include -#include #include #include +#include + +#ifndef XGBOOST_STRICT_CXX98_ +#include +#endif + +#if !defined(__GNUC__) +#define fopen64 std::fopen +#endif #ifdef _MSC_VER -#define fopen64 fopen // NOTE: sprintf_s is not equivalent to snprintf, // they are equivalent when success, which is sufficient for our case #define snprintf sprintf_s @@ -19,16 +26,15 @@ #else #ifdef _FILE_OFFSET_BITS #if _FILE_OFFSET_BITS == 32 -#warning "FILE OFFSET BITS defined to be 32 bit" +#pragma message ("Warning: FILE OFFSET BITS defined to 
be 32 bit") #endif #endif -#ifdef __APPLE__ +#ifdef __APPLE__ #define off64_t off_t -#define fopen64 fopen +#define fopen64 std::fopen #endif -#define _FILE_OFFSET_BITS 64 extern "C" { #include } @@ -47,10 +53,11 @@ typedef long int64_t; namespace xgboost { /*! \brief namespace for helper utils of the project */ namespace utils { -/*! \brief error message buffer length */ -const int kErrorBuffer = 1 << 12; -#ifndef XGBOOST_CUSTOMIZE_ERROR_ +/*! \brief error message buffer length */ +const int kPrintBuffer = 1 << 12; + +#ifndef XGBOOST_CUSTOMIZE_MSG_ /*! * \brief handling of Assert error, caused by in-apropriate input * \param msg error message @@ -67,19 +74,50 @@ inline void HandleCheckError(const char *msg) { fprintf(stderr, "%s\n", msg); exit(-1); } +inline void HandlePrint(const char *msg) { + printf("%s", msg); +} #else +#ifndef XGBOOST_STRICT_CXX98_ // include declarations, some one must implement this void HandleAssertError(const char *msg); void HandleCheckError(const char *msg); +void HandlePrint(const char *msg); #endif +#endif +#ifdef XGBOOST_STRICT_CXX98_ +// these function pointers are to be assigned +extern "C" void (*Printf)(const char *fmt, ...); +extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...); +extern "C" void (*Assert)(int exp, const char *fmt, ...); +extern "C" void (*Check)(int exp, const char *fmt, ...); +extern "C" void (*Error)(const char *fmt, ...); +#else +/*! \brief printf, print message to the console */ +inline void Printf(const char *fmt, ...) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandlePrint(msg.c_str()); +} +/*! \brief portable version of snprintf */ +inline int SPrintf(char *buf, size_t size, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + int ret = vsnprintf(buf, size, fmt, args); + va_end(args); + return ret; +} /*! 
\brief assert an condition is true, use this to handle debug information */ inline void Assert(bool exp, const char *fmt, ...) { if (!exp) { - std::string msg(kErrorBuffer, '\0'); + std::string msg(kPrintBuffer, '\0'); va_list args; va_start(args, fmt); - vsnprintf(&msg[0], kErrorBuffer, fmt, args); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); va_end(args); HandleAssertError(msg.c_str()); } @@ -88,10 +126,10 @@ inline void Assert(bool exp, const char *fmt, ...) { /*!\brief same as assert, but this is intended to be used as message for user*/ inline void Check(bool exp, const char *fmt, ...) { if (!exp) { - std::string msg(kErrorBuffer, '\0'); + std::string msg(kPrintBuffer, '\0'); va_list args; va_start(args, fmt); - vsnprintf(&msg[0], kErrorBuffer, fmt, args); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); va_end(args); HandleCheckError(msg.c_str()); } @@ -100,14 +138,15 @@ inline void Check(bool exp, const char *fmt, ...) { /*! \brief report error message, same as check */ inline void Error(const char *fmt, ...) { { - std::string msg(kErrorBuffer, '\0'); + std::string msg(kPrintBuffer, '\0'); va_list args; va_start(args, fmt); - vsnprintf(&msg[0], kErrorBuffer, fmt, args); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); va_end(args); HandleCheckError(msg.c_str()); } } +#endif /*! \brief replace fopen, report error when the file open fails */ inline FILE *FopenCheck(const char *fname, const char *flag) { @@ -115,7 +154,25 @@ inline FILE *FopenCheck(const char *fname, const char *flag) { Check(fp != NULL, "can not open file \"%s\"\n", fname); return fp; } - -} // namespace utils +} // namespace utils +// easy utils that can be directly acessed in xgboost +/*! \brief get the beginning address of a vector */ +template +inline T *BeginPtr(std::vector &vec) { + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! 
\brief get the beginning address of a vector */ +template +inline const T *BeginPtr(const std::vector &vec) { + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} } // namespace xgboost #endif // XGBOOST_UTILS_UTILS_H_ diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp index 13acacda2..75544dd0e 100644 --- a/src/xgboost_main.cpp +++ b/src/xgboost_main.cpp @@ -50,6 +50,7 @@ class BoostLearnTask{ if (!strcmp("use_buffer", name)) use_buffer = atoi(val); if (!strcmp("num_round", name)) num_round = atoi(val); if (!strcmp("pred_margin", name)) pred_margin = atoi(val); + if (!strcmp("ntree_limit", name)) ntree_limit = atoi(val); if (!strcmp("save_period", name)) save_period = atoi(val); if (!strcmp("eval_train", name)) eval_train = atoi(val); if (!strcmp("task", name)) task = val; @@ -79,6 +80,7 @@ class BoostLearnTask{ save_period = 0; eval_train = 0; pred_margin = 0; + ntree_limit = 0; dump_model_stats = 0; task = "train"; model_in = "NULL"; @@ -186,7 +188,7 @@ class BoostLearnTask{ inline void TaskPred(void) { std::vector preds; if (!silent) printf("start prediction...\n"); - learner.Predict(*data, pred_margin != 0, &preds); + learner.Predict(*data, pred_margin != 0, &preds, ntree_limit); if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); FILE *fo = utils::FopenCheck(name_pred.c_str(), "w"); for (size_t i = 0; i < preds.size(); i++) { @@ -217,6 +219,8 @@ class BoostLearnTask{ std::string task; /*! \brief name of predict file */ std::string name_pred; + /*!\brief limit number of trees in prediction */ + int ntree_limit; /*!\brief whether to directly output margin value */ int pred_margin; /*! 
\brief whether dump statistics along with model */ diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index e4338e0cd..b7bc0ab66 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -365,7 +365,7 @@ class Booster: return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) def eval(self, mat, name = 'eval', it = 0): return self.eval_set( [(mat,name)], it) - def predict(self, data, output_margin=False): + def predict(self, data, output_margin=False, ntree_limit=0): """ predict with data Args: @@ -373,12 +373,14 @@ class Booster: the dmatrix storing the input output_margin: bool whether output raw margin value that is untransformed + + ntree_limit: limit number of trees in prediction, default to 0, 0 means using all the trees Returns: numpy array of prediction """ length = ctypes.c_ulong() preds = xglib.XGBoosterPredict(self.handle, data.handle, - int(output_margin), ctypes.byref(length)) + int(output_margin), ntree_limit, ctypes.byref(length)) return ctypes2numpy(preds, length.value, 'float32') def save_model(self, fname): """ save model to file diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 70c7e87b0..abb844bce 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -6,10 +6,14 @@ #include #include #include +// include all std functions +using namespace std; + #include "./xgboost_wrapper.h" #include "../src/data.h" #include "../src/learner/learner-inl.hpp" #include "../src/io/io.h" +#include "../src/utils/utils.h" #include "../src/io/simple_dmatrix-inl.hpp" using namespace xgboost; @@ -25,11 +29,11 @@ class Booster: public learner::BoostLearner { this->init_model = false; this->SetCacheData(mats); } - const float *Pred(const DataMatrix &dmat, int output_margin, bst_ulong *len) { + inline const float *Pred(const DataMatrix &dmat, int output_margin, unsigned ntree_limit, bst_ulong *len) { this->CheckInitModel(); - this->Predict(dmat, output_margin != 0, &this->preds_); + 
this->Predict(dmat, output_margin != 0, &this->preds_, ntree_limit); *len = static_cast(this->preds_.size()); - return &this->preds_[0]; + return BeginPtr(this->preds_); } inline void BoostOneIter(const DataMatrix &train, float *grad, float *hess, bst_ulong len) { @@ -57,7 +61,7 @@ class Booster: public learner::BoostLearner { model_dump_cptr[i] = model_dump[i].c_str(); } *len = static_cast(model_dump.size()); - return &model_dump_cptr[0]; + return BeginPtr(model_dump_cptr); } // temporal fields // temporal data to save evaluation dump @@ -174,13 +178,13 @@ extern "C"{ std::vector &vec = static_cast(handle)->info.GetFloatInfo(field); vec.resize(len); - memcpy(&vec[0], info, sizeof(float) * len); + memcpy(BeginPtr(vec), info, sizeof(float) * len); } void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, bst_ulong len) { std::vector &vec = static_cast(handle)->info.GetUIntInfo(field); vec.resize(len); - memcpy(&vec[0], info, sizeof(unsigned) * len); + memcpy(BeginPtr(vec), info, sizeof(unsigned) * len); } void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len) { DataMatrix *pmat = static_cast(handle); @@ -194,13 +198,13 @@ extern "C"{ const std::vector &vec = static_cast(handle)->info.GetFloatInfo(field); *len = static_cast(vec.size()); - return &vec[0]; + return BeginPtr(vec); } const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) { const std::vector &vec = static_cast(handle)->info.GetUIntInfo(field); *len = static_cast(vec.size()); - return &vec[0]; + return BeginPtr(vec); } bst_ulong XGDMatrixNumRow(const void *handle) { return static_cast(static_cast(handle)->info.num_row()); @@ -249,8 +253,8 @@ extern "C"{ bst->eval_str = bst->EvalOneIter(iter, mats, names); return bst->eval_str.c_str(); } - const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len) { - return static_cast(handle)->Pred(*static_cast(dmat), output_margin, len); + const float 
*XGBoosterPredict(void *handle, void *dmat, int output_margin, unsigned ntree_limit, bst_ulong *len) { + return static_cast(handle)->Pred(*static_cast(dmat), output_margin, ntree_limit, len); } void XGBoosterLoadModel(void *handle, const char *fname) { static_cast(handle)->LoadModel(fname); diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 65446aea6..9687ec0a3 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -165,9 +165,11 @@ extern "C" { * \param handle handle * \param dmat data matrix * \param output_margin whether only output raw margin value + * \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees + * when the parameter is set to 0, we will use all the trees * \param len used to store length of returning result */ - XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len); + XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, unsigned ntree_limit, bst_ulong *len); /*! * \brief load model from existing file * \param handle handle