diff --git a/.gitignore b/.gitignore index c38e16aed..73ae6748e 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,12 @@ R-package.Rproj *.cache* R-package/inst R-package/src +#java +java/xgboost4j/target +java/xgboost4j/tmp +java/xgboost4j-demo/target +java/xgboost4j-demo/data/ +java/xgboost4j-demo/tmp/ +java/xgboost4j-demo/model/ +nb-configuration* +dmlc-core diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..ac4f58154 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,47 @@ +sudo: true + +# Use Build Matrix to do lint and build separately +env: + matrix: + - TASK=lint LINT_LANG=cpp + - TASK=lint LINT_LANG=python + - TASK=R-package CXX=g++ + - TASK=python-package CXX=g++ + - TASK=java-package CXX=g++ + - TASK=build CXX=g++ + - TASK=build-with-dmlc CXX=g++ + +# dependent apt packages +addons: + apt: + packages: + - doxygen + - libopenmpi-dev + - wget + - libcurl4-openssl-dev + - unzip + - python-numpy + - python-scipy + - python-nose + +before_install: + - git clone https://github.com/dmlc/dmlc-core + - export TRAVIS=dmlc-core/scripts/travis/ + - export PYTHONPATH=${PYTHONPATH}:${PWD}/wrapper + - source ${TRAVIS}/travis_setup_env.sh + +install: + - pip install cpplint pylint --user `whoami` + +script: scripts/travis_script.sh + + +after_failure: + - scripts/travis_after_failure.sh + + +notifications: + email: + on_success: change + on_failure: always + diff --git a/Makefile b/Makefile index e568222c2..a24bea327 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ export CC = gcc export CXX = g++ export MPICXX = mpicxx -export LDFLAGS= -pthread -lm +export LDFLAGS= -pthread -lm export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops +# java include path +export JAVAINCFLAGS = -I${JAVA_HOME}/include -I${JAVA_HOME}/include/linux -I./java ifeq ($(OS), Windows_NT) export CXX = g++ -m64 @@ -10,8 +12,8 @@ ifeq ($(OS), Windows_NT) endif ifeq ($(no_omp),1) - CFLAGS += -DDISABLE_OPENMP -else + CFLAGS += -DDISABLE_OPENMP +else CFLAGS += -fopenmp endif @@ -27,7 +29,7 @@ ifdef dmlc config = $(dmlc)/config.mk else config = $(dmlc)/make/config.mk - endif + endif endif include $(config) include $(dmlc)/make/dmlc.mk @@ -41,7 +43,7 @@ ifndef WITH_FPIC WITH_FPIC = 1 endif ifeq ($(WITH_FPIC), 1) - CFLAGS += -fPIC + CFLAGS += -fPIC endif @@ -53,6 +55,9 @@ else SLIB = wrapper/libxgboostwrapper.so endif +# java lib +JLIB = java/libxgboostjavawrapper.so + # specify tensor path BIN = xgboost MOCKBIN = xgboost.mock @@ -64,7 +69,11 @@ else TARGET = $(BIN) endif -.PHONY: clean all mpi python Rpack +ifndef LINT_LANG + LINT_LANG= "all" +endif + +.PHONY: clean all mpi python Rpack lint all: $(TARGET) mpi: $(MPIBIN) @@ -73,12 +82,15 @@ python: wrapper/libxgboostwrapper.so # now the wrapper takes in two files.
io and wrapper part updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h dmlc_simple.o: src/io/dmlc_simple.cpp src/utils/*.h -gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h +gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h -main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h +main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h xgboost: updater.o gbm.o io.o main.o $(LIBRABIT) $(LIBDMLC) wrapper/xgboost_wrapper.dll wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC) +java: java/libxgboostjavawrapper.so +java/libxgboostjavawrapper.so: java/xgboost4j_wrapper.cpp wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC) + # dependency on rabit subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc + cd subtree/rabit;make lib/librabit.a; cd ../.. @@ -89,23 +101,26 @@ subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc + cd subtree/rabit;make lib/librabit_mpi.a; cd ../.. -$(BIN) : - $(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) +$(BIN) : + $(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -$(MOCKBIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) +$(MOCKBIN) : + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) $(SLIB) : $(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(DLLFLAGS) -$(OBJ) : +$(JLIB) : + $(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(JAVAINCFLAGS) + +$(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -$(MPIOBJ) : - $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) +$(MPIOBJ) : + $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) -$(MPIBIN) : - $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) +$(MPIBIN) : + $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) install: cp -f -r $(BIN) $(INSTALL_PATH) @@ -133,10 +148,23 @@ Rpack: cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars cp xgboost/src/Makevars xgboost/src/Makevars.win # R CMD build --no-build-vignettes xgboost + # R CMD build xgboost + # rm -rf xgboost + # R CMD check --as-cran xgboost*.tar.gz + +Rbuild: + make Rpack R CMD build xgboost rm -rf xgboost + +Rcheck: + make Rbuild R CMD check --as-cran xgboost*.tar.gz +# lint requires dmlc to be in current folder +lint: + dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package + clean: $(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~ cd subtree/rabit; make clean; cd .. 
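The `java` target added above builds java/libxgboostjavawrapper.so from java/xgboost4j_wrapper.cpp, which is why `JAVAINCFLAGS` must point at the JNI headers under `${JAVA_HOME}/include`. A minimal sketch of the Java side of such a binding, using hypothetical names (the real wrapper classes live under org.dmlc.xgboost4j and their native signatures differ):

```java
// Hypothetical sketch only: shows the shape of a JNI binding like the one the
// `java` target above produces; names here are illustrative, not the real API.
public class NativeBindingSketch {
    static {
        // Resolves libxgboostjavawrapper.so (the $(JLIB) rule above) from
        // java.library.path; create_wrap.sh later moves the library into
        // xgboost4j/src/main/resources/lib for packaging.
        System.loadLibrary("xgboostjavawrapper");
    }

    // Each `native` declaration maps to a C symbol compiled from
    // java/xgboost4j_wrapper.cpp against the headers in JAVAINCFLAGS.
    public static native long createDMatrixFromFile(String fname, int silent);
}
```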
diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 4a5d99c7d..f7f6b9192 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -220,7 +220,8 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { stop("nfold must be bigger than 1") } if(is.null(folds)) { - if (exists('objective', where=param) && strtrim(param[['objective']], 5) == 'rank:') { + if (exists('objective', where=param) && is.character(param$objective) && + strtrim(param[['objective']], 5) == 'rank:') { stop("\tAutomatic creation of CV-folds is not implemented for ranking!\n", "\tConsider providing pre-computed CV-folds through the folds parameter.") } @@ -234,7 +235,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { # For classification, need to convert y labels to factor before making the folds, # and then do stratification by factor levels. # For regression, leave y numeric and do stratification by quantiles. - if (exists('objective', where=param)) { + if (exists('objective', where=param) && is.character(param$objective)) { # If 'objective' provided in params, assume that y is a classification label # unless objective is reg:linear if (param[['objective']] != 'reg:linear') y <- factor(y) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index df7fd5648..793d904cd 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -95,152 +95,160 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = prediction = FALSE, showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L, early.stop.round = NULL, maximize = NULL, ...) { - if (typeof(params) != "list") { - stop("xgb.cv: first argument params must be list") - } - if(!is.null(folds)) { - if(class(folds)!="list" | length(folds) < 2) { - stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold") + if (typeof(params) != "list") { + stop("xgb.cv: first argument params must be list") } - nfold <- length(folds) - } - if (nfold <= 1) { - stop("nfold must be bigger than 1") - } - if (is.null(missing)) { - dtrain <- xgb.get.DMatrix(data, label) - } else { - dtrain <- xgb.get.DMatrix(data, label, missing) - } - params <- append(params, list(...)) - params <- append(params, list(silent=1)) - for (mc in metrics) { - params <- append(params, list("eval_metric"=mc)) - } - - # customized objective and evaluation metric interface - if (!is.null(params$objective) && !is.null(obj)) - stop("xgb.cv: cannot assign two different objectives") - if (!is.null(params$objective)) - if (class(params$objective)=='function') { - obj = params$objective - params$objective = NULL + if(!is.null(folds)) { + if(class(folds)!="list" | length(folds) < 2) { + stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold") + } + nfold <- length(folds) } - if (!is.null(params$eval_metric) && !is.null(feval)) - stop("xgb.cv: cannot assign two different evaluation metrics") - if (!is.null(params$eval_metric)) - if (class(params$eval_metric)=='function') { - feval = params$eval_metric - params$eval_metric = NULL + if (nfold <= 1) { + stop("nfold must be bigger than 1") } - - # Early Stopping - if (!is.null(early.stop.round)){ - if (!is.null(feval) && is.null(maximize)) - stop('Please set maximize to note whether the model is maximizing the evaluation or not.') - if (is.null(maximize) && is.null(params$eval_metric)) - stop('Please set maximize to note whether the model is maximizing the evaluation or 
not.') - if (is.null(maximize)) - { - if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { - maximize = FALSE - } else { - maximize = TRUE - } - } - - if (maximize) { - bestScore = 0 + if (is.null(missing)) { + dtrain <- xgb.get.DMatrix(data, label) } else { - bestScore = Inf + dtrain <- xgb.get.DMatrix(data, label, missing) + } + dot.params = list(...) + nms.params = names(params) + nms.dot.params = names(dot.params) + if (length(intersect(nms.params,nms.dot.params))>0) + stop("Duplicated defined term in parameters. Please check your list of params.") + params <- append(params, dot.params) + params <- append(params, list(silent=1)) + for (mc in metrics) { + params <- append(params, list("eval_metric"=mc)) } - bestInd = 0 - earlyStopflag = FALSE - if (length(metrics)>1) - warning('Only the first metric is used for early stopping process.') - } - - xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) - obj_type = params[['objective']] - mat_pred = FALSE - if (!is.null(obj_type) && obj_type=='multi:softprob') - { - num_class = params[['num_class']] - if (is.null(num_class)) - stop('must set num_class to use softmax') - predictValues <- matrix(0,xgb.numrow(dtrain),num_class) - mat_pred = TRUE - } - else - predictValues <- rep(0,xgb.numrow(dtrain)) - history <- c() - print.every.n = max(as.integer(print.every.n), 1L) - for (i in 1:nrounds) { - msg <- list() - for (k in 1:nfold) { - fd <- xgb_folds[[k]] - succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) - if (i% str_split("\t") %>% .[[1]] - } else { - if (!prediction) { - msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] - } else { - res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction) - if (mat_pred) { - pred_mat = matrix(res[[2]],num_class,length(fd$index)) - predictValues[fd$index,] <- t(pred_mat) - } else { - predictValues[fd$index] <- res[[2]] - } - msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]] + # customized objective and evaluation metric interface + if (!is.null(params$objective) && !is.null(obj)) + stop("xgb.cv: cannot assign two different objectives") + if (!is.null(params$objective)) + if (class(params$objective)=='function') { + obj = params$objective + params[['objective']] = NULL + } + # if (!is.null(params$eval_metric) && !is.null(feval)) + # stop("xgb.cv: cannot assign two different evaluation metrics") + if (!is.null(params$eval_metric)) + if (class(params$eval_metric)=='function') { + feval = params$eval_metric + params[['eval_metric']] = NULL } - } - } - ret <- xgb.cv.aggcv(msg, showsd) - history <- c(history, ret) - if(verbose) - if (0==(i-1L)%%print.every.n) - cat(ret, "\n", sep="") - # early_Stopping + # Early Stopping if (!is.null(early.stop.round)){ - score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2] - score = strsplit(score,'\\+|:')[[1]][[2]] - score = as.numeric(score) - if ((maximize && score>bestScore) || (!maximize && score=early.stop.round) { - earlyStopflag = TRUE - cat('Stopping. 
Best iteration:',bestInd) - break + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } } - } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(metrics)>1) + warning('Only the first metric is used for early stopping process.') } - } - - colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") - colnamesMean <- paste(colnames, "mean") - if(showsd) colnamesStd <- paste(colnames, "std") - - colnames <- c() - if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) - else colnames <- colnamesMean - - type <- rep(x = "numeric", times = length(colnames)) - dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table - split <- str_split(string = history, pattern = "\t") - - for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)} - - if (prediction) { - return(list(dt = dt,pred = predictValues)) - } - return(dt) + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) + obj_type = params[['objective']] + mat_pred = FALSE + if (!is.null(obj_type) && obj_type=='multi:softprob') + { + num_class = params[['num_class']] + if (is.null(num_class)) + stop('must set num_class to use softmax') + predictValues <- matrix(0,xgb.numrow(dtrain),num_class) + mat_pred = TRUE + } + else + predictValues <- rep(0,xgb.numrow(dtrain)) + history <- c() + print.every.n = max(as.integer(print.every.n), 1L) + for (i in 1:nrounds) { + msg <- list() + for (k in 1:nfold) { + fd <- xgb_folds[[k]] + succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) + msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] + } + ret <- xgb.cv.aggcv(msg, showsd) + history <- c(history, ret) + if(verbose) + if (0==(i-1L)%%print.every.n) + cat(ret, "\n", sep="") + + # early_Stopping + if (!is.null(early.stop.round)){ + score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2] + score = strsplit(score,'\\+|:')[[1]][[2]] + score = as.numeric(score) + if ((maximize && score>bestScore) || (!maximize && score=early.stop.round) { + earlyStopflag = TRUE + cat('Stopping. 
Best iteration:',bestInd) + break + } + } + } + + } + + if (prediction) { + for (k in 1:nfold) { + fd = xgb_folds[[k]] + if (!is.null(early.stop.round) && earlyStopflag) { + res = xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction) + } else { + res = xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction) + } + if (mat_pred) { + pred_mat = matrix(res[[2]],num_class,length(fd$index)) + predictValues[fd$index,] = t(pred_mat) + } else { + predictValues[fd$index] = res[[2]] + } + } + } + + + colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") + colnamesMean <- paste(colnames, "mean") + if(showsd) colnamesStd <- paste(colnames, "std") + + colnames <- c() + if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) + else colnames <- colnamesMean + + type <- rep(x = "numeric", times = length(colnames)) + dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table + split <- str_split(string = history, pattern = "\t") + + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)} + + if (prediction) { + return(list(dt = dt,pred = predictValues)) + } + return(dt) } # Avoid error messages during CRAN check. diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 3acd2b174..45ce90447 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -137,7 +137,13 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (length(watchlist) != 0 && verbose == 0) { warning('watchlist is provided but verbose=0, no evaluation information will be printed') } - params = append(params, list(...)) + + dot.params = list(...) + nms.params = names(params) + nms.dot.params = names(dot.params) + if (length(intersect(nms.params,nms.dot.params))>0) + stop("Duplicated term in parameters. Please check your list of params.") + params = append(params, dot.params) # customized objective and evaluation metric interface if (!is.null(params$objective) && !is.null(obj)) diff --git a/R-package/README.md b/R-package/README.md index e974e3554..81dabb31c 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -1,6 +1,8 @@ -# R package for xgboost. +R package for xgboost +===================== -## Installation +Installation +------------ For up-to-date version (which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first. @@ -8,8 +10,26 @@ For up-to-date version (which is recommended), please install from github. Windo devtools::install_github('dmlc/xgboost',subdir='R-package') ``` - -## Examples +Examples +-------- * Please visit [walk through example](demo). * See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd). 
+ +Notes +----- + +If you face an issue installing the package using ```devtools::install_github```, with an error like this (even after updating libxml and RCurl as a lot of forums suggest) - + +``` +devtools::install_github('dmlc/xgboost',subdir='R-package') +Downloading github repo dmlc/xgboost@master +Error in function (type, msg, asError = TRUE) : + Peer certificate cannot be authenticated with given CA certificates +``` +To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) - +``` +1. Clone the current repository and set your workspace to xgboost/R-package/ +2. Run R CMD INSTALL --build . in terminal to get the tarball. +3. Run install.packages('path_to_the_tarball',repos=NULL) in R to install. +``` diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R index e3a536cfe..11de17a91 100644 --- a/R-package/demo/create_sparse_matrix.R +++ b/R-package/demo/create_sparse_matrix.R @@ -1,8 +1,10 @@ require(xgboost) require(Matrix) require(data.table) -if (!require(vcd)) install.packages('vcd') #Available in Cran. Used for its dataset with categorical values. - +if (!require(vcd)) { + install.packages('vcd') #Available on CRAN. Used for its dataset with categorical values. + require(vcd) +} # According to its documentation, Xgboost works only on numbers. # Sometimes the dataset we have to work on have categorical data. # A categorical variable is one which have a fixed number of values. By exemple, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable. diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index de6ed339f..a8084b206 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -1,9 +1,10 @@ +// Copyright (c) 2014 by Contributors #include #include #include #include #include -#include +#include #include "wrapper/xgboost_wrapper.h" #include "src/utils/utils.h" #include "src/utils/omp.h" @@ -34,7 +35,7 @@ bool CheckNAN(double v) { bool LogGamma(double v) { return lgammafn(v); } -} // namespace utils +} // namespace utils namespace random { void Seed(unsigned seed) { @@ -58,25 +59,30 @@ inline void _WrapperEnd(void) { PutRNGstate(); } +// do nothing, check error +inline void CheckErr(int ret) { +} + extern "C" { SEXP XGCheckNullPtr_R(SEXP handle) { return ScalarLogical(R_ExternalPtrAddr(handle) == NULL); } - void _DMatrixFinalizer(SEXP ext) { if (R_ExternalPtrAddr(ext) == NULL) return; XGDMatrixFree(R_ExternalPtrAddr(ext)); R_ClearExternalPtr(ext); } SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { _WrapperBegin(); - void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent)); + DMatrixHandle handle; + CheckErr(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle)); _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); return ret; } - SEXP XGDMatrixCreateFromMat_R(SEXP mat, + SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing) { _WrapperBegin(); SEXP dim = getAttrib(mat, R_DimSymbol); @@ -90,12 +96,13 @@ extern "C" { data[i * ncol +j] = din[i + nrow * j]; } } - void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing)); + DMatrixHandle handle; + CheckErr(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle)); _WrapperEnd(); SEXP ret =
PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - return ret; + return ret; } SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, @@ -118,8 +125,10 @@ extern "C" { indices_[i] = static_cast(p_indices[i]); data_[i] = static_cast(p_data[i]); } - void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_), - BeginPtr(data_), nindptr, ndata); + DMatrixHandle handle; + CheckErr(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_), + BeginPtr(data_), nindptr, ndata, + &handle)); _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); @@ -133,17 +142,20 @@ extern "C" { for (int i = 0; i < len; ++i) { idxvec[i] = INTEGER(idxset)[i] - 1; } - void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len); + DMatrixHandle res; + CheckErr(XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), + BeginPtr(idxvec), len, + &res)); _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - return ret; + return ret; } void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) { _WrapperBegin(); - XGDMatrixSaveBinary(R_ExternalPtrAddr(handle), - CHAR(asChar(fname)), asInteger(silent)); + CheckErr(XGDMatrixSaveBinary(R_ExternalPtrAddr(handle), + CHAR(asChar(fname)), asInteger(silent))); _WrapperEnd(); } void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) { @@ -152,28 +164,31 @@ extern "C" { const char *name = CHAR(asChar(field)); if (!strcmp("group", name)) { std::vector vec(len); - #pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) for (int i = 0; i < len; ++i) { vec[i] = static_cast(INTEGER(array)[i]); } - XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len); + CheckErr(XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len)); } else { std::vector vec(len); #pragma omp parallel for schedule(static) for (int i = 0; i < len; ++i) { vec[i] = REAL(array)[i]; } - XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), - CHAR(asChar(field)), - BeginPtr(vec), len); + CheckErr(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), + CHAR(asChar(field)), + BeginPtr(vec), len)); } _WrapperEnd(); } SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) { _WrapperBegin(); bst_ulong olen; - const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), - CHAR(asChar(field)), &olen); + const float *res; + CheckErr(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), + CHAR(asChar(field)), + &olen, + &res)); _WrapperEnd(); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { @@ -183,23 +198,25 @@ extern "C" { return ret; } SEXP XGDMatrixNumRow_R(SEXP handle) { - bst_ulong nrow = XGDMatrixNumRow(R_ExternalPtrAddr(handle)); + bst_ulong nrow; + CheckErr(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrow)); return ScalarInteger(static_cast(nrow)); } // functions related to booster - void _BoosterFinalizer(SEXP ext) { + void _BoosterFinalizer(SEXP ext) { if (R_ExternalPtrAddr(ext) == NULL) return; - XGBoosterFree(R_ExternalPtrAddr(ext)); + CheckErr(XGBoosterFree(R_ExternalPtrAddr(ext))); R_ClearExternalPtr(ext); } SEXP XGBoosterCreate_R(SEXP dmats) { _WrapperBegin(); int len = length(dmats); std::vector dvec; - for (int i = 0; i < len; ++i){ + for (int i = 0; i < len; ++i) { dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); } - void *handle 
= XGBoosterCreate(BeginPtr(dvec), dvec.size()); + BoosterHandle handle; + CheckErr(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle)); _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); @@ -208,16 +225,16 @@ extern "C" { } void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { _WrapperBegin(); - XGBoosterSetParam(R_ExternalPtrAddr(handle), - CHAR(asChar(name)), - CHAR(asChar(val))); + CheckErr(XGBoosterSetParam(R_ExternalPtrAddr(handle), + CHAR(asChar(name)), + CHAR(asChar(val)))); _WrapperEnd(); } void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) { _WrapperBegin(); - XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle), - asInteger(iter), - R_ExternalPtrAddr(dtrain)); + CheckErr(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle), + asInteger(iter), + R_ExternalPtrAddr(dtrain))); _WrapperEnd(); } void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) { @@ -230,9 +247,10 @@ extern "C" { tgrad[j] = REAL(grad)[j]; thess[j] = REAL(hess)[j]; } - XGBoosterBoostOneIter(R_ExternalPtrAddr(handle), - R_ExternalPtrAddr(dtrain), - BeginPtr(tgrad), BeginPtr(thess), len); + CheckErr(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle), + R_ExternalPtrAddr(dtrain), + BeginPtr(tgrad), BeginPtr(thess), + len)); _WrapperEnd(); } SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) { @@ -249,21 +267,24 @@ extern "C" { for (int i = 0; i < len; ++i) { vec_sptr.push_back(vec_names[i].c_str()); } - const char *ret = - XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), - asInteger(iter), - BeginPtr(vec_dmats), BeginPtr(vec_sptr), len); + const char *ret; + CheckErr(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), + asInteger(iter), + BeginPtr(vec_dmats), + BeginPtr(vec_sptr), + len, &ret)); _WrapperEnd(); return mkString(ret); } SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) { _WrapperBegin(); bst_ulong olen; - const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle), - R_ExternalPtrAddr(dmat), - asInteger(option_mask), - asInteger(ntree_limit), - &olen); + const float *res; + CheckErr(XGBoosterPredict(R_ExternalPtrAddr(handle), + R_ExternalPtrAddr(dmat), + asInteger(option_mask), + asInteger(ntree_limit), + &olen, &res)); _WrapperEnd(); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { @@ -274,15 +295,15 @@ extern "C" { } void XGBoosterLoadModel_R(SEXP handle, SEXP fname) { _WrapperBegin(); - XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))); + CheckErr(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)))); _WrapperEnd(); } void XGBoosterSaveModel_R(SEXP handle, SEXP fname) { _WrapperBegin(); - XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))); + CheckErr(XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)))); _WrapperEnd(); } - void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) { + void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) { _WrapperBegin(); XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle), RAW(raw), @@ -292,28 +313,29 @@ extern "C" { SEXP XGBoosterModelToRaw_R(SEXP handle) { bst_ulong olen; _WrapperBegin(); - const char *raw = XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen); + const char *raw; + CheckErr(XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen, &raw)); _WrapperEnd(); SEXP ret = PROTECT(allocVector(RAWSXP, olen)); if (olen != 0) { memcpy(RAW(ret), raw, olen); } - UNPROTECT(1); + 
UNPROTECT(1); return ret; } SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) { _WrapperBegin(); bst_ulong olen; - const char **res = - XGBoosterDumpModel(R_ExternalPtrAddr(handle), - CHAR(asChar(fmap)), - asInteger(with_stats), - &olen); + const char **res; + CheckErr(XGBoosterDumpModel(R_ExternalPtrAddr(handle), + CHAR(asChar(fmap)), + asInteger(with_stats), + &olen, &res)); _WrapperEnd(); - SEXP out = PROTECT(allocVector(STRSXP, olen)); - for (size_t i = 0; i < olen; ++i) { + SEXP out = PROTECT(allocVector(STRSXP, olen)); + for (size_t i = 0; i < olen; ++i) { stringstream stream; - stream << "booster["< #include @@ -19,7 +21,7 @@ extern "C" { */ SEXP XGCheckNullPtr_R(SEXP handle); /*! - * \brief load a data matrix + * \brief load a data matrix * \param fname name of the content * \param silent whether print messages * \return a loaded data matrix @@ -32,9 +34,9 @@ extern "C" { * \param missing which value to represent missing value * \return created dmatrix */ - SEXP XGDMatrixCreateFromMat_R(SEXP mat, + SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing); - /*! + /*! * \brief create a matrix content from CSC format * \param indptr pointer to column headers * \param indices row indices @@ -70,26 +72,26 @@ extern "C" { * \param handle a instance of data matrix * \param field field name * \return info vector - */ + */ SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field); /*! * \brief return number of rows * \param handle a instance of data matrix */ SEXP XGDMatrixNumRow_R(SEXP handle); - /*! - * \brief create xgboost learner + /*! + * \brief create xgboost learner * \param dmats a list of dmatrix handles that will be cached - */ + */ SEXP XGBoosterCreate_R(SEXP dmats); - /*! - * \brief set parameters + /*! + * \brief set parameters * \param handle handle * \param name parameter name * \param val value of parameter */ void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val); - /*! + /*! * \brief update the model in one round using dtrain * \param handle handle * \param iter current iteration rounds @@ -132,12 +134,12 @@ extern "C" { * \brief save model into existing file * \param handle handle * \param fname file name - */ + */ void XGBoosterSaveModel_R(SEXP handle, SEXP fname); /*! * \brief load model from raw array * \param handle handle - */ + */ void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw); /*! * \brief save model into R's raw array @@ -153,4 +155,4 @@ extern "C" { */ SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats); } -#endif // XGBOOST_WRAPPER_R_H_ +#endif // XGBOOST_WRAPPER_R_H_ // NOLINT(*) diff --git a/R-package/src/xgboost_assert.c b/R-package/src/xgboost_assert.c index 20b789492..072074243 100644 --- a/R-package/src/xgboost_assert.c +++ b/R-package/src/xgboost_assert.c @@ -1,3 +1,4 @@ +// Copyright (c) 2014 by Contributors #include #include #include @@ -6,17 +7,17 @@ void XGBoostAssert_R(int exp, const char *fmt, ...) { char buf[1024]; if (exp == 0) { - va_list args; + va_list args; va_start(args, fmt); vsprintf(buf, fmt, args); va_end(args); error("AssertError:%s\n", buf); - } + } } void XGBoostCheck_R(int exp, const char *fmt, ...) { char buf[1024]; if (exp == 0) { - va_list args; + va_list args; va_start(args, fmt); vsprintf(buf, fmt, args); va_end(args); @@ -25,7 +26,7 @@ void XGBoostCheck_R(int exp, const char *fmt, ...) { } int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) 
{ int ret; - va_list args; + va_list args; va_start(args, fmt); ret = vsnprintf(buf, size, fmt, args); va_end(args); diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index b7648340d..39ab819f7 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -337,6 +337,17 @@ err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label) print(paste("test-error=", err)) ``` +View feature importance/influence from the learnt model +------------------------------------------------------- + +Feature importance is similar to R gbm package's relative influence (rel.inf). + +``` +importance_matrix <- xgb.importance(model = bst) +print(importance_matrix) +xgb.plot.importance(importance_matrix) +``` + View the trees from a model --------------------------- @@ -346,6 +357,12 @@ You can dump the tree you learned using `xgb.dump` into a text file. xgb.dump(bst, with.stats = T) ``` +You can plot the trees from your model using `xgb.plot.tree` + +``` +xgb.plot.tree(model = bst) +``` + > if you provide a path to `fname` parameter you can save the trees to your hard drive. Save and load models diff --git a/README.md b/README.md index 415bf771b..e6f5d69d1 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,14 @@ -XGBoost: eXtreme Gradient Boosting +DMLC/XGBoost ================================== +[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) + An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. -It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scale to Terascale data +It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT).
XGBoost can also be [distributed](#features) and scale to Terascale data Contributors: https://github.com/dmlc/xgboost/graphs/contributors -Documentations: [Documentation of xgboost](doc/README.md) +Documentation: [Documentation of dmlc/xgboost](doc/README.md) Issues Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion) @@ -24,11 +26,19 @@ XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) What's New ========== +* XGBoost helps Chenglong Chen win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance) + - Check out the winning solution at [Highlight links](doc/README.md#highlight-links) * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04) -* XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing) +* XGBoost helps three champion teams win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing) - Check out the winning solution at [Highlight links](doc/README.md#highlight-links) * [External Memory Version](doc/external_memory.md) +Contributing to XGBoost +========= +XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute; it is a great way to make the project better and more accessible to more users. +* Check out the [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something. +* Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users. + Features ======== * Easily accessible in python, R, Julia, CLI diff --git a/demo/binary_classification/README.md b/demo/binary_classification/README.md index 02c06e550..482666ec4 100644 --- a/demo/binary_classification/README.md +++ b/demo/binary_classification/README.md @@ -147,7 +147,7 @@ Run the command again, we can find the log file becomes ``` The rule is eval[name-printed-in-log] = filename, then the file will be added to monitoring process, and evaluated each round. -xgboost also support monitoring multiple metrics, suppose we also want to monitor average log-likelihood of each prediction during training, simply add ```eval_metric=logloss``` to configure. Run again, we can find the log file becomes +xgboost also supports monitoring multiple metrics. Suppose we also want to monitor the average log-likelihood of each prediction during training; simply add ```eval_metric=logloss``` to the configuration. Run again, and we can find the log file becomes ``` [0] test-error:0.016139 test-negllik:0.029795 trainname-error:0.014433 trainname-negllik:0.027023 [1] test-error:0.000000 test-negllik:0.000000 trainname-error:0.001228 trainname-negllik:0.002457 @@ -162,11 +162,15 @@ If you want to continue boosting from existing model, say 0002.model, use ``` xgboost will load from 0002.model and continue boosting for 2 rounds, and save output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function. #### Use Multi-Threading -When you are working with a large dataset, you may want to take advantage of parallelism.
If your compiler supports OpenMP, xgboost is naturally multi-threaded, to set number of parallel running threads to 10, add ```nthread=10``` to your configuration. +When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded; to set the number of parallel threads, add the ```nthread``` parameter to your configuration. +E.g. ```nthread=10``` + +Set nthread to the number of physical CPU cores (on Unix, this can be found using ```lscpu```). +Some systems will have ```Thread(s) per core = 2```; for example, on a 4-core CPU with 8 hardware threads, set ```nthread=4``` and not 8. #### Additional Notes * What are ```agaricus.txt.test.buffer``` and ```agaricus.txt.train.buffer``` generated during runexp.sh? - - By default xgboost will automatically generate a binary format buffer of input data, with suffix ```buffer```. When next time you run xgboost, it detects i -Demonstrating how to use XGBoost accomplish binary classification tasks on UCI mushroom dataset http://archive.ics.uci.edu/ml/datasets/Mushroom + - By default xgboost will automatically generate a binary format buffer of input data, with suffix ```buffer```. Next time you run xgboost, it will detect these binary files. + diff --git a/demo/kaggle-otto/understandingXGBoostModel.Rmd b/demo/kaggle-otto/understandingXGBoostModel.Rmd index f0858e2da..6bd64401d 100644 --- a/demo/kaggle-otto/understandingXGBoostModel.Rmd +++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd @@ -45,7 +45,7 @@ dim(train) train[1:6,1:5, with =F] # Test dataset dimensions -dim(train) +dim(test) # Test content test[1:6,1:5, with =F] @@ -228,4 +228,4 @@ There are 4 documents you may also be interested in: * [xgboostPresentation.Rmd](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd): general presentation * [discoverYourData.Rmd](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/discoverYourData.Rmd): explaining feature analysis * [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit): use case -* [The Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/): very good book to have a good understanding of the model \ No newline at end of file +* [The Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/): very good book to have a good understanding of the model diff --git a/doc/README.md b/doc/README.md index d9884c682..e8df7d57d 100644 --- a/doc/README.md +++ b/doc/README.md @@ -20,7 +20,8 @@ How to get started Highlight Links ==== This section is about blogposts, presentations and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs here, send a pull request.
-* [Kaggle Malware Prediction winning solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware) +* [Kaggle CrowdFlower winner's solution by Chenglong Chen](https://github.com/ChenglongChen/Kaggle_CrowdFlower) +* [Kaggle Malware Prediction winner's solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware) * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) * [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit) * Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y) @@ -29,3 +30,7 @@ This section is about blogposts, presentation and videos discussing how to use x Contribution ==== Contribution of documents and use-cases are welcomed! +* This package uses the Google C++ style +* To check code style: + - clone https://github.com/dmlc/dmlc-core into the root directory + - type ```make lint``` and fix possible errors. diff --git a/doc/build.md b/doc/build.md index a5261b604..7b8ee96aa 100644 --- a/doc/build.md +++ b/doc/build.md @@ -17,13 +17,15 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost 1. Obtain gcc with openmp support by `brew install gcc --without-multilib` **or** clang with openmp by `brew install clang-omp`. The clang one is recommended because the first method requires us compiling gcc inside the machine (more than an hour in mine)! (BTW, `brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.) -2. **if plaing to use clang-omp** in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to +2. **if you are planning to use clang-omp** - in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to ```C++ #include <libiomp/omp.h> /* instead of #include <omp.h> */ ``` - to make it work, otherwise the following steps would show `src/tree/../utils/omp.h:9:10: error: 'omp.h' file not found...` + to make it work, otherwise you might get this error + + `src/tree/../utils/omp.h:9:10: error: 'omp.h' file not found...` @@ -41,13 +43,13 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost export CXX = clang-omp++ ``` - Remember to change `header` if using clang-omp. + Remember to change `header` (mentioned in step 2) if using clang-omp. Then `cd xgboost` then `bash build.sh` to compile XGBoost. And go to `wrapper` sub-folder to install python version. 4. Set the `Makevars` file in highest priority for R. - The point is, there are three `Makevars` inside the machine: `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by runing `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default!! After trying, it seems that the first one has highest piority (surprise!). + The point is, there are three `Makevars` : `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by running `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default!! After trying, it seems that the first one has highest priority (surprise!).
So, **add** or **change** `~/.R/Makevars` to the following lines: diff --git a/java/README.md b/java/README.md new file mode 100644 index 000000000..12cbb4582 --- /dev/null +++ b/java/README.md @@ -0,0 +1,28 @@ +# xgboost4j +This is a Java wrapper for xgboost. + +The structure of this wrapper is almost the same as that of the official Python wrapper. + +The core of this wrapper is two classes: + +* DMatrix: for handling data + +* Booster: for training and prediction + +## usage: + Please refer to [xgboost4j.md](doc/xgboost4j.md) for more information. + + Besides, simple examples can be found in [xgboost4j-demo](xgboost4j-demo/README.md) + + +## build native library + +For Windows: open xgboost.sln in the "../windows" folder; you will find the xgboostjavawrapper project. Do the following steps to build the wrapper library: + * Select x64/win32 and Release in build + * (if you have set `JAVA_HOME` properly in the Windows environment variables, skip this step) right click on the xgboostjavawrapper project -> choose "Properties" -> click on "C/C++" in the window -> change the "Additional Include Directories" to fit your JDK install path. + * rebuild all + * double click "create_wrap.bat" to move the library to the proper place + +For Linux: + * make sure you have installed the JDK and `JAVA_HOME` has been set properly + * run "create_wrap.sh" diff --git a/java/create_wrap.bat b/java/create_wrap.bat new file mode 100644 index 000000000..e7f8603cd --- /dev/null +++ b/java/create_wrap.bat @@ -0,0 +1,20 @@ +echo "move native library" +set libsource=..\windows\x64\Release\xgboostjavawrapper.dll + +if not exist %libsource% ( +goto end +) + +set libfolder=xgboost4j\src\main\resources\lib +set libpath=%libfolder%\xgboostjavawrapper.dll +if not exist %libfolder% (mkdir %libfolder%) +if exist %libpath% (del %libpath%) +move %libsource% %libfolder% +echo complete +pause +exit + +:end + echo "source library not found, please build it first from ..\windows\xgboost.sln" + pause + exit \ No newline at end of file diff --git a/java/create_wrap.sh b/java/create_wrap.sh new file mode 100755 index 000000000..d66e4dbd4 --- /dev/null +++ b/java/create_wrap.sh @@ -0,0 +1,15 @@ +echo "build java wrapper" +cd .. +make java +cd java +echo "move native lib" + +libPath="xgboost4j/src/main/resources/lib" +if [ ! -d "$libPath" ]; then + mkdir -p "$libPath" +fi + +rm -f xgboost4j/src/main/resources/lib/libxgboostjavawrapper.so +mv libxgboostjavawrapper.so xgboost4j/src/main/resources/lib/ + +echo "complete" diff --git a/java/doc/xgboost4j.md b/java/doc/xgboost4j.md new file mode 100644 index 000000000..201b3cc05 --- /dev/null +++ b/java/doc/xgboost4j.md @@ -0,0 +1,156 @@ +xgboost4j : java wrapper for xgboost +==== + +This page will introduce xgboost4j, the Java wrapper for xgboost, including: +* [Building](#build-xgboost4j) +* [Data Interface](#data-interface) +* [Setting Parameters](#setting-parameters) +* [Train Model](#training-model) +* [Prediction](#prediction) + += +#### Build xgboost4j +* Build native library +First make sure you have installed the JDK and `JAVA_HOME` has been set properly, then simply run `./create_wrap.sh`. + +* Package xgboost4j +To package xgboost4j, you can run `mvn package` in the xgboost4j folder or just use an IDE (Eclipse/NetBeans) to open this Maven project and build. + += +#### Data Interface +Like the xgboost python module, xgboost4j uses ```DMatrix``` to handle data; libsvm text format files, sparse matrices in CSR/CSC format, and dense matrices are supported. The sketch right after this paragraph shows how the CSR arrays used in the examples below can be derived from a dense matrix.
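As a hedged illustration (the helper below is hypothetical and not part of xgboost4j), deriving the CSR arrays from a dense matrix makes the layout concrete: `rowHeaders[i+1]` is the cumulative number of nonzeros through row i, while `colIndex` and `data` record the column and value of each nonzero in row order.

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical helper, not part of xgboost4j: builds the CSR arrays that
// DMatrix(long[], int[], float[], DMatrix.SparseType.CSR) expects.
public class CsrFromDense {
    public static void main(String[] args) {
        float[][] dense = {
            {1f, 0f, 2f, 0f},
            {4f, 0f, 0f, 3f},
            {3f, 1f, 2f, 0f}
        };
        long[] rowHeaders = new long[dense.length + 1];
        List<Integer> colIndex = new ArrayList<Integer>();
        List<Float> data = new ArrayList<Float>();
        for (int i = 0; i < dense.length; i++) {
            for (int j = 0; j < dense[i].length; j++) {
                if (dense[i][j] != 0f) {        // keep nonzero entries only
                    colIndex.add(j);
                    data.add(dense[i][j]);
                }
            }
            rowHeaders[i + 1] = data.size();    // cumulative nonzeros per row
        }
        // Prints rowHeaders=[0, 2, 4, 7]
        System.out.println(java.util.Arrays.toString(rowHeaders));
        System.out.println(colIndex + " " + data);
    }
}
```

On the 3x4 matrix used in the CSR example below, this yields rowHeaders {0,2,4,7}, colIndex {0,2,0,3,0,1,2} and data {1,2,4,3,3,1,2}, matching the hand-written arrays.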
+ +* To import ```DMatrix``` : +```java +import org.dmlc.xgboost4j.DMatrix; +``` + +* To load a libsvm text format file, the usage is like: +```java +DMatrix dmat = new DMatrix("train.svm.txt"); +``` + +* Loading a sparse matrix in CSR/CSC format is a little more complicated; the usage is like: +suppose a sparse matrix : +1 0 2 0 +4 0 0 3 +3 1 2 0 + + for CSR format +```java +long[] rowHeaders = new long[] {0,2,4,7}; +float[] data = new float[] {1f,2f,4f,3f,3f,1f,2f}; +int[] colIndex = new int[] {0,2,0,3,0,1,2}; +DMatrix dmat = new DMatrix(rowHeaders, colIndex, data, DMatrix.SparseType.CSR); +``` + + for CSC format +```java +long[] colHeaders = new long[] {0,3,4,6,7}; +float[] data = new float[] {1f,4f,3f,1f,2f,2f,3f}; +int[] rowIndex = new int[] {0,1,2,2,0,2,1}; +DMatrix dmat = new DMatrix(colHeaders, rowIndex, data, DMatrix.SparseType.CSC); +``` + +* To load a 3*2 dense matrix, the usage is like: +suppose a matrix : +1 2 +3 4 +5 6 + +```java +float[] data = new float[] {1f,2f,3f,4f,5f,6f}; +int nrow = 3; +int ncol = 2; +float missing = 0.0f; +DMatrix dmat = new DMatrix(data, nrow, ncol, missing); +``` + +* To set weight : +```java +float[] weights = new float[] {1f,2f,1f}; +dmat.setWeight(weights); +``` + +#### Setting Parameters +* in xgboost4j, any ```Iterable<Entry<String, Object>>``` object can be used as parameters. + +* to set parameters, for params without repeated keys, you can simply use the entrySet of a Map: +```java +Map<String, Object> paramMap = new HashMap<String, Object>() { + { + put("eta", 1.0); + put("max_depth", 2); + put("silent", 1); + put("objective", "binary:logistic"); + put("eval_metric", "logloss"); + } +}; +Iterable<Entry<String, Object>> params = paramMap.entrySet(); +``` +* for situations where multiple values share the same param key, a List<Entry<String, Object>> is a good choice, e.g. : +```java +List<Entry<String, Object>> params = new ArrayList<Entry<String, Object>>() { + { + add(new SimpleEntry<String, Object>("eta", 1.0)); + add(new SimpleEntry<String, Object>("max_depth", 2.0)); + add(new SimpleEntry<String, Object>("silent", 1)); + add(new SimpleEntry<String, Object>("objective", "binary:logistic")); + } +}; +``` + +#### Training Model +With parameters and data, you are able to train a booster model. +* Import ```Trainer``` and ```Booster``` : +```java +import org.dmlc.xgboost4j.Booster; +import org.dmlc.xgboost4j.util.Trainer; +``` + +* Training +```java +DMatrix trainMat = new DMatrix("train.svm.txt"); +DMatrix validMat = new DMatrix("valid.svm.txt"); +//specify a watchList to see the performance +//any Iterable<Entry<String, DMatrix>> object can be used as watchList +List<Entry<String, DMatrix>> watchs = new ArrayList<>(); +watchs.add(new SimpleEntry<>("train", trainMat)); +watchs.add(new SimpleEntry<>("valid", validMat)); +int round = 2; +Booster booster = Trainer.train(params, trainMat, round, watchs, null, null); +``` + +* Saving model: After training, you can save the model and dump it out.
+```java +booster.saveModel("model.bin"); +``` + +* Dump Model and Feature Map +```java +booster.dumpModel("modelInfo.txt", false) +//dump with featureMap +booster.dumpModel("modelInfo.txt", "featureMap.txt", false) +``` + +* Load a model +```java +Params param = new Params() { + { + put("silent", 1); + put("nthread", 6); + } +}; +Booster booster = new Booster(param, "model.bin"); +``` + +#### Prediction +After training and loading a model, you can use it to predict on other data. The prediction results will be a two-dimensional float array of shape (nsample, nclass); for leaf prediction it will be (nsample, nclass*ntrees). +```java +DMatrix dtest = new DMatrix("test.svm.txt"); +//predict +float[][] predicts = booster.predict(dtest); +//predict leaf +float[][] leafPredicts = booster.predict(dtest, 0, true); +``` diff --git a/java/xgboost4j-demo/LICENSE b/java/xgboost4j-demo/LICENSE new file mode 100644 index 000000000..9a1673be2 --- /dev/null +++ b/java/xgboost4j-demo/LICENSE @@ -0,0 +1,15 @@ +/* +Copyright (c) 2014 by Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ \ No newline at end of file diff --git a/java/xgboost4j-demo/README.md b/java/xgboost4j-demo/README.md new file mode 100644 index 000000000..c9cb35e4b --- /dev/null +++ b/java/xgboost4j-demo/README.md @@ -0,0 +1,10 @@ +xgboost4j examples +==== +* [Basic walkthrough of wrappers](src/main/java/org/dmlc/xgboost4j/demo/BasicWalkThrough.java) +* [Customize loss function and evaluation metric](src/main/java/org/dmlc/xgboost4j/demo/CustomObjective.java) +* [Boosting from existing prediction](src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java) +* [Predicting using first n trees](src/main/java/org/dmlc/xgboost4j/demo/PredictFirstNtree.java) +* [Generalized Linear Model](src/main/java/org/dmlc/xgboost4j/demo/GeneralizedLinearModel.java) +* [Cross validation](src/main/java/org/dmlc/xgboost4j/demo/CrossValidation.java) +* [Predicting leaf indices](src/main/java/org/dmlc/xgboost4j/demo/PredictLeafIndices.java) +* [External Memory](src/main/java/org/dmlc/xgboost4j/demo/ExternalMemory.java) diff --git a/java/xgboost4j-demo/pom.xml b/java/xgboost4j-demo/pom.xml new file mode 100644 index 000000000..28c51bc13 --- /dev/null +++ b/java/xgboost4j-demo/pom.xml @@ -0,0 +1,36 @@ +<project> + <modelVersion>4.0.0</modelVersion> + <groupId>org.dmlc</groupId> + <artifactId>xgboost4j-demo</artifactId> + <version>1.0</version> + <packaging>jar</packaging> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <maven.compiler.source>1.7</maven.compiler.source> + <maven.compiler.target>1.7</maven.compiler.target> + </properties> + <dependencies> + <dependency> + <groupId>org.dmlc</groupId> + <artifactId>xgboost4j</artifactId> + <version>1.1</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>2.4</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + <version>3.4</version> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.11</version> + <scope>test</scope> + </dependency> + </dependencies> +</project> \ No newline at end of file diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BasicWalkThrough.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BasicWalkThrough.java new file mode 100644 index 000000000..0c6529d2c --- /dev/null +++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BasicWalkThrough.java @@ -0,0 +1,164 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j.demo; + +import java.io.File; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.AbstractMap; +import java.util.AbstractMap.SimpleEntry; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import org.dmlc.xgboost4j.Booster; +import org.dmlc.xgboost4j.DMatrix; +import org.dmlc.xgboost4j.demo.util.DataLoader; +import org.dmlc.xgboost4j.demo.util.Params; +import org.dmlc.xgboost4j.util.Trainer; +import org.dmlc.xgboost4j.util.XGBoostError; + +/** + * a simple example of java wrapper for xgboost + * @author hzx + */ +public class BasicWalkThrough { + public static boolean checkPredicts(float[][] fPredicts, float[][] sPredicts) { + if(fPredicts.length != sPredicts.length) { + return false; + } + + for(int i=0; i> object would be used as paramters + //e.g. + // Map paramMap = new HashMap() { + // { + // put("eta", 1.0); + // put("max_depth", 2); + // put("silent", 1); + // put("objective", "binary:logistic"); + // } + // }; + // Iterable> param = paramMap.entrySet(); + + //or + // List> param = new ArrayList>() { + // { + // add(new SimpleEntry("eta", 1.0)); + // add(new SimpleEntry("max_depth", 2.0)); + // add(new SimpleEntry("silent", 1)); + // add(new SimpleEntry("objective", "binary:logistic")); + // } + // }; + + //we use a util class Params to handle parameters as example + Iterable> param = new Params() { + { + put("eta", 1.0); + put("max_depth", 2); + put("silent", 1); + put("objective", "binary:logistic"); + } + }; + + + + //specify watchList to set evaluation dmats + //note: any Iterable> object would be used as watchList + //e.g. 
+ //an entrySet of Map is good + // Map watchMap = new HashMap<>(); + // watchMap.put("train", trainMat); + // watchMap.put("test", testMat); + // Iterable> watchs = watchMap.entrySet(); + + //we use a List of Entry WatchList as example + List> watchs = new ArrayList<>(); + watchs.add(new SimpleEntry<>("train", trainMat)); + watchs.add(new SimpleEntry<>("test", testMat)); + + //set round + int round = 2; + + //train a boost model + Booster booster = Trainer.train(param, trainMat, round, watchs, null, null); + + //predict + float[][] predicts = booster.predict(testMat); + + //save model to modelPath + File file = new File("./model"); + if(!file.exists()) { + file.mkdirs(); + } + + String modelPath = "./model/xgb.model"; + booster.saveModel(modelPath); + + //dump model + booster.dumpModel("./model/dump.raw.txt", false); + + //dump model with feature map + booster.dumpModel("./model/dump.nice.txt", "../../demo/data/featmap.txt", false); + + //save dmatrix into binary buffer + testMat.saveBinary("./model/dtest.buffer"); + + //reload model and data + Booster booster2 = new Booster(param, "./model/xgb.model"); + DMatrix testMat2 = new DMatrix("./model/dtest.buffer"); + float[][] predicts2 = booster2.predict(testMat2); + + + //check the two predicts + System.out.println(checkPredicts(predicts, predicts2)); + + System.out.println("start build dmatrix from csr sparse data ..."); + //build dmatrix from CSR Sparse Matrix + DataLoader.CSRSparseData spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); + + DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, DMatrix.SparseType.CSR); + trainMat2.setLabel(spData.labels); + + //specify watchList + List> watchs2 = new ArrayList<>(); + watchs2.add(new SimpleEntry<>("train", trainMat2)); + watchs2.add(new SimpleEntry<>("test", testMat2)); + Booster booster3 = Trainer.train(param, trainMat2, round, watchs2, null, null); + float[][] predicts3 = booster3.predict(testMat2); + + //check predicts + System.out.println(checkPredicts(predicts, predicts3)); + } +} diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java new file mode 100644 index 000000000..a81da0c59 --- /dev/null +++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java @@ -0,0 +1,67 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java
new file mode 100644
index 000000000..a81da0c59
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/BoostFromPrediction.java
@@ -0,0 +1,67 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.demo;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.demo.util.Params;
+import org.dmlc.xgboost4j.util.Trainer;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * example of starting training from an initial base prediction
+ * @author hzx
+ */
+public class BoostFromPrediction {
+    public static void main(String[] args) throws XGBoostError {
+        System.out.println("start running example to start from an initial prediction");
+
+        //load file from text file, also binary buffer generated by xgboost4j
+        DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+        DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+
+        //specify parameters
+        Params param = new Params() {
+            {
+                put("eta", 1.0);
+                put("max_depth", 2);
+                put("silent", 1);
+                put("objective", "binary:logistic");
+            }
+        };
+
+        //specify watchList
+        List<Map.Entry<String, DMatrix>> watchs = new ArrayList<>();
+        watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat));
+        watchs.add(new AbstractMap.SimpleEntry<>("test", testMat));
+
+        //train xgboost for 1 round
+        Booster booster = Trainer.train(param, trainMat, 1, watchs, null, null);
+
+        //predict with output_margin=true, so we get the raw margins instead of probabilities
+        float[][] trainPred = booster.predict(trainMat, true);
+        float[][] testPred = booster.predict(testMat, true);
+
+        //use the margins as the base prediction for further training
+        trainMat.setBaseMargin(trainPred);
+        testMat.setBaseMargin(testPred);
+
+        System.out.println("result of running from initial prediction");
+        Booster booster2 = Trainer.train(param, trainMat, 1, watchs, null, null);
+    }
+}
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/CrossValidation.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/CrossValidation.java
new file mode 100644
index 000000000..6dcf917da
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/CrossValidation.java
@@ -0,0 +1,54 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */ +package org.dmlc.xgboost4j.demo; + +import java.io.IOException; +import org.dmlc.xgboost4j.DMatrix; +import org.dmlc.xgboost4j.util.Trainer; +import org.dmlc.xgboost4j.demo.util.Params; +import org.dmlc.xgboost4j.util.XGBoostError; + +/** + * an example of cross validation + * @author hzx + */ +public class CrossValidation { + public static void main(String[] args) throws IOException, XGBoostError { + //load train mat + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); + + //set params + Params param = new Params() { + { + put("eta", 1.0); + put("max_depth", 3); + put("silent", 1); + put("nthread", 6); + put("objective", "binary:logistic"); + put("gamma", 1.0); + put("eval_metric", "error"); + } + }; + + //do 5-fold cross validation + int round = 2; + int nfold = 5; + //set additional eval_metrics + String[] metrics = null; + + String[] evalHist = Trainer.crossValiation(param, trainMat, round, nfold, metrics, null, null); + } +} diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/CustomObjective.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/CustomObjective.java new file mode 100644 index 000000000..2b8c44ecd --- /dev/null +++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/CustomObjective.java @@ -0,0 +1,175 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */
+package org.dmlc.xgboost4j.demo;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.IEvaluation;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.IObjective;
+import org.dmlc.xgboost4j.demo.util.Params;
+import org.dmlc.xgboost4j.util.Trainer;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * an example of a user-defined objective and eval.
+ * NOTE: with a customized loss function, the default prediction value is the margin,
+ * which may make the built-in evaluation metrics misbehave.
+ * For example, with logistic loss the prediction is the score before the logistic transformation,
+ * while the built-in evaluation error assumes input after the logistic transformation.
+ * Keep this in mind when you use the customization; you may need to write a customized evaluation function as well.
+ * @author hzx
+ */
+public class CustomObjective {
+    /**
+     * log-likelihood loss obj function
+     */
+    public static class LogRegObj implements IObjective {
+        private static final Log logger = LogFactory.getLog(LogRegObj.class);
+
+        /**
+         * simple sigmoid func
+         * @param input
+         * @return
+         * Note: this func is not concerned with numerical stability, it is only used as an example
+         */
+        public float sigmoid(float input) {
+            float val = (float) (1/(1+Math.exp(-input)));
+            return val;
+        }
+
+        public float[][] transform(float[][] predicts) {
+            int nrow = predicts.length;
+            float[][] transPredicts = new float[nrow][1];
+
+            for(int i=0; i<nrow; i++) {
+                transPredicts[i][0] = sigmoid(predicts[i][0]);
+            }
+
+            return transPredicts;
+        }
+
+        @Override
+        public List<float[]> getGradient(float[][] predicts, DMatrix dtrain) {
+            int nrow = predicts.length;
+            List<float[]> gradients = new ArrayList<>();
+            float[] labels;
+            try {
+                labels = dtrain.getLabel();
+            } catch (XGBoostError ex) {
+                logger.error(ex);
+                return null;
+            }
+            float[] grad = new float[nrow];
+            float[] hess = new float[nrow];
+
+            float[][] transPredicts = transform(predicts);
+
+            //grad = p - y, hess = p * (1 - p) for logistic loss
+            for(int i=0; i<nrow; i++) {
+                float predict = transPredicts[i][0];
+                grad[i] = predict - labels[i];
+                hess[i] = predict * (1 - predict);
+            }
+
+            gradients.add(grad);
+            gradients.add(hess);
+            return gradients;
+        }
+    }
+
+    /**
+     * customized evaluation: error rate on raw margin predictions
+     */
+    public static class EvalError implements IEvaluation {
+        private static final Log logger = LogFactory.getLog(EvalError.class);
+
+        String evalMetric = "custom_error";
+
+        @Override
+        public String getMetric() {
+            return evalMetric;
+        }
+
+        @Override
+        public float eval(float[][] predicts, DMatrix dmat) {
+            float error = 0f;
+            float[] labels;
+            try {
+                labels = dmat.getLabel();
+            } catch (XGBoostError ex) {
+                logger.error(ex);
+                return -1f;
+            }
+            int nrow = predicts.length;
+            for(int i=0; i<nrow; i++) {
+                if(labels[i]==0f && predicts[i][0]>0) {
+                    error++;
+                }
+                else if(labels[i]==1f && predicts[i][0]<=0) {
+                    error++;
+                }
+            }
+
+            return error/labels.length;
+        }
+    }
+
+    public static void main(String[] args) throws XGBoostError {
+        //load train mat (svmlight format)
+        DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+        //load valid mat (svmlight format)
+        DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+
+        //set params
+        Params param = new Params() {
+            {
+                put("eta", 1.0);
+                put("max_depth", 2);
+                put("silent", 1);
+            }
+        };
+
+        //set round
+        int round = 2;
+
+        //specify watchList
+        List<Map.Entry<String, DMatrix>> watchs = new ArrayList<>();
+        watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat));
+        watchs.add(new AbstractMap.SimpleEntry<>("test", testMat));
+
+        //user-defined obj and eval
+        IObjective obj = new LogRegObj();
+        IEvaluation eval = new EvalError();
+
+        //train a booster
+        System.out.println("begin to train the booster model");
+        Booster booster = Trainer.train(param, trainMat, round, watchs, obj, eval);
+    }
+}
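The gradient pair returned by LogRegObj follows from the log loss: with p = sigmoid(margin), the first derivative with respect to the margin is p - y and the second is p * (1 - p). A quick way to sanity-check any custom objective is to compare the analytic gradient against a central finite difference; a self-contained sketch (the logLoss helper and the epsilon value are illustrative, not part of this patch):

    public class GradCheck {
        static double sigmoid(double x) { return 1.0 / (1.0 + Math.exp(-x)); }

        //log loss for one example with label y and raw margin m
        static double logLoss(double y, double m) {
            double p = sigmoid(m);
            return -(y * Math.log(p) + (1 - y) * Math.log(1 - p));
        }

        public static void main(String[] args) {
            double y = 1.0, m = 0.3, eps = 1e-6;
            double analytic = sigmoid(m) - y;  //grad = p - y
            double numeric = (logLoss(y, m + eps) - logLoss(y, m - eps)) / (2 * eps);
            System.out.println(analytic + " vs " + numeric);  //should agree to ~1e-9
        }
    }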
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/ExternalMemory.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/ExternalMemory.java
new file mode 100644
index 000000000..b0a9d27dc
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/ExternalMemory.java
@@ -0,0 +1,65 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.demo;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.demo.util.Params;
+import org.dmlc.xgboost4j.util.Trainer;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * simple example of using the external memory version
+ * @author hzx
+ */
+public class ExternalMemory {
+    public static void main(String[] args) throws XGBoostError {
+        //this is the only difference: append a # followed by a cache prefix name
+        //several cache files with that prefix will be generated
+        //currently only conversion from libsvm files is supported
+        DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache");
+        DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache");
+
+        //specify parameters
+        Params param = new Params() {
+            {
+                put("eta", 1.0);
+                put("max_depth", 2);
+                put("silent", 1);
+                put("objective", "binary:logistic");
+            }
+        };
+
+        //performance note: set nthread to the number of physical cores
+        //some cpus offer two threads per core; for example, on a 4-core cpu with 8 threads set nthread=4
+        //param.put("nthread", num_real_cpu);
+
+        //specify watchList
+        List<Map.Entry<String, DMatrix>> watchs = new ArrayList<>();
+        watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat));
+        watchs.add(new AbstractMap.SimpleEntry<>("test", testMat));
+
+        //set round
+        int round = 2;
+
+        //train a boost model
+        Booster booster = Trainer.train(param, trainMat, round, watchs, null, null);
+    }
+}
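One way to follow the nthread advice in the comment above without hard-coding a number is to derive it from the JVM. A drop-in for the commented line, under the assumption that availableProcessors() reports hardware threads and two threads share each physical core (not true on every machine):

    //assumption: 2 hardware threads per physical core
    int physicalCores = Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
    param.put("nthread", physicalCores);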
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/GeneralizedLinearModel.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/GeneralizedLinearModel.java
new file mode 100644
index 000000000..7d3d717bd
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/GeneralizedLinearModel.java
@@ -0,0 +1,74 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.demo;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.demo.util.CustomEval;
+import org.dmlc.xgboost4j.demo.util.Params;
+import org.dmlc.xgboost4j.util.Trainer;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * an example of fitting a generalized linear model in xgboost;
+ * basically, we use a linear model instead of trees for our boosters
+ * @author hzx
+ */
+public class GeneralizedLinearModel {
+    public static void main(String[] args) throws XGBoostError {
+        // load file from text file, also binary buffer generated by xgboost4j
+        DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+        DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+
+        //specify parameters
+        //change booster to gblinear, so that we are fitting a linear model
+        //alpha is the L1 regularizer
+        //lambda is the L2 regularizer
+        //you can also set lambda_bias, which is the L2 regularizer on the bias term
+        Params param = new Params() {
+            {
+                put("alpha", 0.0001);
+                put("silent", 1);
+                put("objective", "binary:logistic");
+                put("booster", "gblinear");
+            }
+        };
+        //normally you do not need to set eta (step_size);
+        //XGBoost uses a parallel coordinate descent algorithm (shotgun),
+        //and parallelization can affect convergence in certain cases,
+        //so setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
+        //param.put("eta", "0.5");
+
+        //specify watchList
+        List<Map.Entry<String, DMatrix>> watchs = new ArrayList<>();
+        watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat));
+        watchs.add(new AbstractMap.SimpleEntry<>("test", testMat));
+
+        //train a booster
+        int round = 4;
+        Booster booster = Trainer.train(param, trainMat, round, watchs, null, null);
+
+        float[][] predicts = booster.predict(testMat);
+
+        CustomEval eval = new CustomEval();
+        System.out.println("error=" + eval.eval(predicts, testMat));
+    }
+}
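The comment block above names three regularizers for gblinear (alpha, lambda, lambda_bias) but the demo only sets alpha. If you want to experiment with the other two, they slot into the same Params block; a sketch, with values picked purely for illustration:

    Params param = new Params() {
        {
            put("booster", "gblinear");       //linear booster instead of trees
            put("objective", "binary:logistic");
            put("alpha", 0.0001);             //L1 regularizer on the weights
            put("lambda", 1.0);               //L2 regularizer on the weights
            put("lambda_bias", 0.0);          //L2 regularizer on the bias term
            put("silent", 1);
        }
    };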
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/PredictFirstNtree.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/PredictFirstNtree.java
new file mode 100644
index 000000000..2bbd1fd6c
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/PredictFirstNtree.java
@@ -0,0 +1,69 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.demo;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.util.Trainer;
+
+import org.dmlc.xgboost4j.demo.util.CustomEval;
+import org.dmlc.xgboost4j.demo.util.Params;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * predict using only the first n trees
+ * @author hzx
+ */
+public class PredictFirstNtree {
+    public static void main(String[] args) throws XGBoostError {
+        // load file from text file, also binary buffer generated by xgboost4j
+        DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+        DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+
+        //specify parameters
+        Params param = new Params() {
+            {
+                put("eta", 1.0);
+                put("max_depth", 2);
+                put("silent", 1);
+                put("objective", "binary:logistic");
+            }
+        };
+
+        //specify watchList
+        List<Map.Entry<String, DMatrix>> watchs = new ArrayList<>();
+        watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat));
+        watchs.add(new AbstractMap.SimpleEntry<>("test", testMat));
+
+        //train a booster
+        int round = 3;
+        Booster booster = Trainer.train(param, trainMat, round, watchs, null, null);
+
+        //predict using 1 tree
+        float[][] predicts1 = booster.predict(testMat, false, 1);
+        //by default, all trees are used for prediction
+        float[][] predicts2 = booster.predict(testMat);
+
+        //use a simple evaluation class to check the error
+        CustomEval eval = new CustomEval();
+        System.out.println("error of predicts1: " + eval.eval(predicts1, testMat));
+        System.out.println("error of predicts2: " + eval.eval(predicts2, testMat));
+    }
+}
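Since the tree limit is just an argument to predict, the same pattern extends to tracing how the error falls as trees are added. A sketch that reuses booster, round, testMat and CustomEval from the demo above:

    CustomEval eval = new CustomEval();
    for (int ntree = 1; ntree <= round; ntree++) {
        float[][] p = booster.predict(testMat, false, ntree);
        System.out.println("error with first " + ntree + " tree(s): " + eval.eval(p, testMat));
    }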
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/PredictLeafIndices.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/PredictLeafIndices.java
new file mode 100644
index 000000000..ede103aeb
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/PredictLeafIndices.java
@@ -0,0 +1,70 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.demo;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.util.Trainer;
+import org.dmlc.xgboost4j.demo.util.Params;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * predict leaf indices
+ * @author hzx
+ */
+public class PredictLeafIndices {
+    public static void main(String[] args) throws XGBoostError {
+        // load file from text file, also binary buffer generated by xgboost4j
+        DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+        DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+
+        //specify parameters
+        Params param = new Params() {
+            {
+                put("eta", 1.0);
+                put("max_depth", 2);
+                put("silent", 1);
+                put("objective", "binary:logistic");
+            }
+        };
+
+        //specify watchList
+        List<Map.Entry<String, DMatrix>> watchs = new ArrayList<>();
+        watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat));
+        watchs.add(new AbstractMap.SimpleEntry<>("test", testMat));
+
+        //train a booster
+        int round = 3;
+        Booster booster = Trainer.train(param, trainMat, round, watchs, null, null);
+
+        //predict leaf indices using the first 2 trees
+        float[][] leafindex = booster.predict(testMat, 2, true);
+        for(float[] leafs : leafindex) {
+            System.out.println(Arrays.toString(leafs));
+        }
+
+        //predict leaf indices using all trees
+        leafindex = booster.predict(testMat, 0, true);
+        for(float[] leafs : leafindex) {
+            System.out.println(Arrays.toString(leafs));
+        }
+    }
+}
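A common use for leaf indices is as categorical features for a downstream model: each tree becomes one feature whose value is the leaf the example lands in. A minimal sketch of the conversion, continuing from the leafindex array above; the downstream encoding is left to the reader:

    //leafindex[i][t] is the leaf that row i reaches in tree t
    int[][] leafFeatures = new int[leafindex.length][];
    for (int i = 0; i < leafindex.length; i++) {
        leafFeatures[i] = new int[leafindex[i].length];
        for (int t = 0; t < leafindex[i].length; t++) {
            leafFeatures[i][t] = (int) leafindex[i][t];  //indices come back as floats
        }
    }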
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/util/CustomEval.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/util/CustomEval.java
new file mode 100644
index 000000000..5f25278d5
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/util/CustomEval.java
@@ -0,0 +1,60 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.demo.util;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.IEvaluation;
+import org.dmlc.xgboost4j.util.XGBoostError;
+
+/**
+ * a util evaluation class for the examples
+ * @author hzx
+ */
+public class CustomEval implements IEvaluation {
+    private static final Log logger = LogFactory.getLog(CustomEval.class);
+
+    String evalMetric = "custom_error";
+
+    @Override
+    public String getMetric() {
+        return evalMetric;
+    }
+
+    @Override
+    public float eval(float[][] predicts, DMatrix dmat) {
+        float error = 0f;
+        float[] labels;
+        try {
+            labels = dmat.getLabel();
+        } catch (XGBoostError ex) {
+            logger.error(ex);
+            return -1f;
+        }
+        int nrow = predicts.length;
+        for(int i=0; i<nrow; i++) {
+            if(labels[i]==0f && predicts[i][0]>0.5) {
+                error++;
+            }
+            else if(labels[i]==1f && predicts[i][0]<=0.5) {
+                error++;
+            }
+        }
+
+        return error/labels.length;
+    }
+}
diff --git a/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/util/DataLoader.java b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/util/DataLoader.java
new file mode 100644
index 000000000..9bad8b372
--- /dev/null
+++ b/java/xgboost4j-demo/src/main/java/org/dmlc/xgboost4j/demo/util/DataLoader.java
@@ -0,0 +1,127 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */ +package org.dmlc.xgboost4j.demo.util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.lang3.ArrayUtils; + +/** + * util class for loading data + * @author hzx + */ +public class DataLoader { + public static class DenseData { + public float[] labels; + public float[] data; + public int nrow; + public int ncol; + } + + public static class CSRSparseData { + public float[] labels; + public float[] data; + public long[] rowHeaders; + public int[] colIndex; + } + + public static DenseData loadCSVFile(String filePath) throws FileNotFoundException, UnsupportedEncodingException, IOException { + DenseData denseData = new DenseData(); + + File f = new File(filePath); + FileInputStream in = new FileInputStream(f); + BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + + denseData.nrow = 0; + denseData.ncol = -1; + String line; + List tlabels = new ArrayList<>(); + List tdata = new ArrayList<>(); + + while((line=reader.readLine()) != null) { + String[] items = line.trim().split(","); + if(items.length==0) { + continue; + } + denseData.nrow++; + if(denseData.ncol == -1) { + denseData.ncol = items.length - 1; + } + + tlabels.add(Float.valueOf(items[items.length-1])); + for(int i=0; i tlabels = new ArrayList<>(); + List tdata = new ArrayList<>(); + List theaders = new ArrayList<>(); + List tindex = new ArrayList<>(); + + File f = new File(filePath); + FileInputStream in = new FileInputStream(f); + BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + + String line; + long rowheader = 0; + theaders.add(rowheader); + while((line=reader.readLine()) != null) { + String[] items = line.trim().split(" "); + if(items.length==0) { + continue; + } + + rowheader += items.length - 1; + theaders.add(rowheader); + tlabels.add(Float.valueOf(items[0])); + + for(int i=1; i>{ + List> params = new ArrayList<>(); + + /** + * put param key-value pair + * @param key + * @param value + */ + public void put(String key, Object value) { + params.add(new AbstractMap.SimpleEntry<>(key, value)); + } + + @Override + public String toString(){ + String paramsInfo = ""; + for(Entry param : params) { + paramsInfo += param.getKey() + ":" + param.getValue() + "\n"; + } + return paramsInfo; + } + + @Override + public Iterator> iterator() { + return params.iterator(); + } +} diff --git a/java/xgboost4j/LICENSE b/java/xgboost4j/LICENSE new file mode 100644 index 000000000..9a1673be2 --- /dev/null +++ b/java/xgboost4j/LICENSE @@ -0,0 +1,15 @@ +/* +Copyright (c) 2014 by Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
\ No newline at end of file
diff --git a/java/xgboost4j/README.md b/java/xgboost4j/README.md
new file mode 100644
index 000000000..e46a5b3a0
--- /dev/null
+++ b/java/xgboost4j/README.md
@@ -0,0 +1,23 @@
+# xgboost4j
+This is a java wrapper for xgboost (https://github.com/dmlc/xgboost).
+The structure of this wrapper is almost the same as that of the official python wrapper.
+The core of this wrapper consists of two classes:
+
+* DMatrix: for handling data
+
+* Booster: for training and prediction
+
+## usage:
+
+ simple examples can be found in the test package:
+
+ * Simple Train Example: org.dmlc.xgboost4j.TrainExample.java
+
+ * Simple Predict Example: org.dmlc.xgboost4j.PredictExample.java
+
+ * Cross Validation Example: org.dmlc.xgboost4j.example.CVExample.java
+
+## native library:
+
+ only 64-bit linux/windows is supported at the moment. if you want to build the native wrapper library yourself, please refer to
+ https://github.com/yanqingmen/xgboost-java, then put your native library into the "./src/main/resources/lib" folder and replace the originals (either "libxgboostjavawrapper.so" for linux or "xgboostjavawrapper.dll" for windows).
diff --git a/java/xgboost4j/pom.xml b/java/xgboost4j/pom.xml
new file mode 100644
index 000000000..5e312bf4f
--- /dev/null
+++ b/java/xgboost4j/pom.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>org.dmlc</groupId>
+    <artifactId>xgboost4j</artifactId>
+    <version>1.1</version>
+    <packaging>jar</packaging>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven.compiler.source>1.7</maven.compiler.source>
+        <maven.compiler.target>1.7</maven.compiler.target>
+    </properties>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <version>2.10.3</version>
+            </plugin>
+        </plugins>
+    </build>
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>commons-logging</groupId>
+            <artifactId>commons-logging</artifactId>
+            <version>1.2</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
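Given the coordinates declared in this pom, a downstream Maven project would pull the wrapper in with a dependency block like the following; this assumes the artifact has been installed to a reachable repository, e.g. via mvn install, since it is not published to Maven Central by this patch:

    <dependency>
        <groupId>org.dmlc</groupId>
        <artifactId>xgboost4j</artifactId>
        <version>1.1</version>
    </dependency>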
diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/Booster.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/Booster.java
new file mode 100644
index 000000000..c2058ceaa
--- /dev/null
+++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/Booster.java
@@ -0,0 +1,484 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.dmlc.xgboost4j.util.Initializer;
+import org.dmlc.xgboost4j.util.ErrorHandle;
+import org.dmlc.xgboost4j.util.XGBoostError;
+import org.dmlc.xgboost4j.wrapper.XgboostJNI;
+
+
+/**
+ * Booster for xgboost, similar to the python wrapper xgboost.py,
+ * but custom obj and eval functions are not supported at present.
+ * @author hzx
+ */
+public final class Booster {
+    private static final Log logger = LogFactory.getLog(Booster.class);
+
+    long handle = 0;
+
+    //load native library
+    static {
+        try {
+            Initializer.InitXgboost();
+        } catch (IOException ex) {
+            logger.error("load native library failed.");
+            logger.error(ex);
+        }
+    }
+
+    /**
+     * init Booster from a DMatrix array
+     * @param params parameters
+     * @param dMatrixs DMatrix array
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public Booster(Iterable<Map.Entry<String, Object>> params, DMatrix[] dMatrixs) throws XGBoostError {
+        init(dMatrixs);
+        setParam("seed","0");
+        setParams(params);
+    }
+
+    /**
+     * load model from modelPath
+     * @param params parameters
+     * @param modelPath booster modelPath (model generated by booster.saveModel)
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public Booster(Iterable<Map.Entry<String, Object>> params, String modelPath) throws XGBoostError {
+        init(null);
+        if(modelPath == null) {
+            throw new NullPointerException("modelPath : null");
+        }
+        loadModel(modelPath);
+        setParam("seed","0");
+        setParams(params);
+    }
+
+    private void init(DMatrix[] dMatrixs) throws XGBoostError {
+        long[] handles = null;
+        if(dMatrixs != null) {
+            handles = dMatrixs2handles(dMatrixs);
+        }
+        long[] out = new long[1];
+        ErrorHandle.checkCall(XgboostJNI.XGBoosterCreate(handles, out));
+
+        handle = out[0];
+    }
+
+    /**
+     * set parameter
+     * @param key param name
+     * @param value param value
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public final void setParam(String key, String value) throws XGBoostError {
+        ErrorHandle.checkCall(XgboostJNI.XGBoosterSetParam(handle, key, value));
+    }
+
+    /**
+     * set parameters
+     * @param params parameter key-value pairs
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public void setParams(Iterable<Map.Entry<String, Object>> params) throws XGBoostError {
+        if(params!=null) {
+            for(Map.Entry<String, Object> entry : params) {
+                setParam(entry.getKey(), entry.getValue().toString());
+            }
+        }
+    }
+
+    /**
+     * Update (one iteration)
+     * @param dtrain training data
+     * @param iter current iteration number
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public void update(DMatrix dtrain, int iter) throws XGBoostError {
+        ErrorHandle.checkCall(XgboostJNI.XGBoosterUpdateOneIter(handle, iter, dtrain.getHandle()));
+    }
+
+    /**
+     * update with a customized obj func
+     * @param dtrain training data
+     * @param iter current iteration number
+     * @param obj customized objective class
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public void update(DMatrix dtrain, int iter, IObjective obj) throws XGBoostError {
+        float[][] predicts = predict(dtrain, true);
+        List<float[]> gradients = obj.getGradient(predicts, dtrain);
+        boost(dtrain, gradients.get(0), gradients.get(1));
+    }
+
+    /**
+     * update with given grad and hess
+     * @param dtrain training data
+     * @param grad first order of gradient
+     * @param hess second order of gradient
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public void boost(DMatrix dtrain, float[] grad, float[] hess) throws XGBoostError {
+        if(grad.length != hess.length) {
+            throw new AssertionError(String.format("grad/hess length mismatch %s / %s", grad.length, hess.length));
+        }
+        ErrorHandle.checkCall(XgboostJNI.XGBoosterBoostOneIter(handle, dtrain.getHandle(), grad, hess));
+    }
+
+    /**
+     * evaluate with given dmatrixs.
+ * @param evalMatrixs dmatrixs for evaluation + * @param evalNames name for eval dmatrixs, used for check results + * @param iter current eval iteration + * @return eval information + * @throws org.dmlc.xgboost4j.util.XGBoostError + */ + public String evalSet(DMatrix[] evalMatrixs, String[] evalNames, int iter) throws XGBoostError { + long[] handles = dMatrixs2handles(evalMatrixs); + String[] evalInfo = new String[1]; + ErrorHandle.checkCall(XgboostJNI.XGBoosterEvalOneIter(handle, iter, handles, evalNames, evalInfo)); + return evalInfo[0]; + } + + /** + * evaluate with given customized Evaluation class + * @param evalMatrixs + * @param evalNames + * @param iter + * @param eval + * @return eval information + * @throws org.dmlc.xgboost4j.util.XGBoostError + */ + public String evalSet(DMatrix[] evalMatrixs, String[] evalNames, int iter, IEvaluation eval) throws XGBoostError { + String evalInfo = ""; + for(int i=0; i getFeatureScore() throws XGBoostError { + String[] modelInfos = getDumpInfo(false); + Map featureScore = new HashMap<>(); + for(String tree : modelInfos) { + for(String node : tree.split("\n")) { + String[] array = node.split("\\["); + if(array.length == 1) { + continue; + } + String fid = array[1].split("\\]")[0]; + fid = fid.split("<")[0]; + if(featureScore.containsKey(fid)) { + featureScore.put(fid, 1 + featureScore.get(fid)); + } + else { + featureScore.put(fid, 1); + } + } + } + return featureScore; + } + + + /** + * get importance of each feature + * @param featureMap file to save dumped model info + * @return featureMap key: feature index, value: feature importance score + * @throws org.dmlc.xgboost4j.util.XGBoostError + */ + public Map getFeatureScore(String featureMap) throws XGBoostError { + String[] modelInfos = getDumpInfo(featureMap, false); + Map featureScore = new HashMap<>(); + for(String tree : modelInfos) { + for(String node : tree.split("\n")) { + String[] array = node.split("\\["); + if(array.length == 1) { + continue; + } + String fid = array[1].split("\\]")[0]; + fid = fid.split("<")[0]; + if(featureScore.containsKey(fid)) { + featureScore.put(fid, 1 + featureScore.get(fid)); + } + else { + featureScore.put(fid, 1); + } + } + } + return featureScore; + } + + /** + * transfer DMatrix array to handle array (used for native functions) + * @param dmatrixs + * @return handle array for input dmatrixs + */ + private static long[] dMatrixs2handles(DMatrix[] dmatrixs) { + long[] handles = new long[dmatrixs.length]; + for(int i=0; i getGradient(float[][] predicts, DMatrix dtrain); +} diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/CVPack.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/CVPack.java new file mode 100644 index 000000000..a9b932f0d --- /dev/null +++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/CVPack.java @@ -0,0 +1,89 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */
+package org.dmlc.xgboost4j.util;
+
+import java.util.Map;
+import org.dmlc.xgboost4j.IEvaluation;
+import org.dmlc.xgboost4j.Booster;
+import org.dmlc.xgboost4j.DMatrix;
+import org.dmlc.xgboost4j.IObjective;
+
+/**
+ * cross validation package for xgboost
+ * @author hzx
+ */
+public class CVPack {
+    DMatrix dtrain;
+    DMatrix dtest;
+    DMatrix[] dmats;
+    String[] names;
+    Booster booster;
+
+    /**
+     * create a cross validation package
+     * @param dtrain train data
+     * @param dtest test data
+     * @param params parameters
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public CVPack(DMatrix dtrain, DMatrix dtest, Iterable<Map.Entry<String, Object>> params) throws XGBoostError {
+        dmats = new DMatrix[] {dtrain, dtest};
+        booster = new Booster(params, dmats);
+        names = new String[] {"train", "test"};
+        this.dtrain = dtrain;
+        this.dtest = dtest;
+    }
+
+    /**
+     * update one iteration
+     * @param iter iteration num
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public void update(int iter) throws XGBoostError {
+        booster.update(dtrain, iter);
+    }
+
+    /**
+     * update one iteration
+     * @param iter iteration num
+     * @param obj customized objective
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public void update(int iter, IObjective obj) throws XGBoostError {
+        booster.update(dtrain, iter, obj);
+    }
+
+    /**
+     * evaluation
+     * @param iter iteration num
+     * @return
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public String eval(int iter) throws XGBoostError {
+        return booster.evalSet(dmats, names, iter);
+    }
+
+    /**
+     * evaluation
+     * @param iter iteration num
+     * @param eval customized eval
+     * @return
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public String eval(int iter, IEvaluation eval) throws XGBoostError {
+        return booster.evalSet(dmats, names, iter, eval);
+    }
+}
diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/ErrorHandle.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/ErrorHandle.java
new file mode 100644
index 000000000..688cd2719
--- /dev/null
+++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/ErrorHandle.java
@@ -0,0 +1,50 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.util;
+
+import java.io.IOException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dmlc.xgboost4j.wrapper.XgboostJNI;
+
+/**
+ * error handler for Xgboost
+ * @author hzx
+ */
+public class ErrorHandle {
+    private static final Log logger = LogFactory.getLog(ErrorHandle.class);
+
+    //load native library
+    static {
+        try {
+            Initializer.InitXgboost();
+        } catch (IOException ex) {
+            logger.error("load native library failed.");
+            logger.error(ex);
+        }
+    }
+
+    /**
+     * check the return value of a C API call
+     * @param ret return value of an xgboostJNI C API call
+     * @throws org.dmlc.xgboost4j.util.XGBoostError
+     */
+    public static void checkCall(int ret) throws XGBoostError {
+        if(ret != 0) {
+            throw new XGBoostError(XgboostJNI.XGBGetLastError());
+        }
+    }
+}
diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/Initializer.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/Initializer.java
new file mode 100644
index 000000000..83932ce84
--- /dev/null
+++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/Initializer.java
@@ -0,0 +1,92 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package org.dmlc.xgboost4j.util;
+
+import java.io.IOException;
+import java.lang.reflect.Field;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * class to load the native library
+ * @author hzx
+ */
+public class Initializer {
+    private static final Log logger = LogFactory.getLog(Initializer.class);
+
+    static boolean initialized = false;
+    public static final String nativePath = "./lib";
+    public static final String nativeResourcePath = "/lib/";
+    public static final String[] libNames = new String[] {"xgboostjavawrapper"};
+
+    public static synchronized void InitXgboost() throws IOException {
+        if(initialized == false) {
+            for(String libName: libNames) {
+                smartLoad(libName);
+            }
+            initialized = true;
+        }
+    }
+
+    /**
+     * load the native library: first try java.library.path, then fall back to the copy bundled in the jar package.
+ * @param libName + * @throws IOException + */ + private static void smartLoad(String libName) throws IOException { + addNativeDir(nativePath); + try { + System.loadLibrary(libName); + } + catch (UnsatisfiedLinkError e) { + try { + NativeUtils.loadLibraryFromJar(nativeResourcePath + System.mapLibraryName(libName)); + } + catch (IOException e1) { + throw e1; + } + } + } + + /** + * add libPath to java.library.path, then native library in libPath would be load properly + * @param libPath + * @throws IOException + */ + public static void addNativeDir(String libPath) throws IOException { + try { + Field field = ClassLoader.class.getDeclaredField("usr_paths"); + field.setAccessible(true); + String[] paths = (String[]) field.get(null); + for (String path : paths) { + if (libPath.equals(path)) { + return; + } + } + String[] tmp = new String[paths.length+1]; + System.arraycopy(paths,0,tmp,0,paths.length); + tmp[paths.length] = libPath; + field.set(null, tmp); + } catch (IllegalAccessException e) { + logger.error(e.getMessage()); + throw new IOException("Failed to get permissions to set library path"); + } catch (NoSuchFieldException e) { + logger.error(e.getMessage()); + throw new IOException("Failed to get field handle to set library path"); + } + } +} diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/NativeUtils.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/NativeUtils.java new file mode 100644 index 000000000..c0f199005 --- /dev/null +++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/NativeUtils.java @@ -0,0 +1,109 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j.util; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + + +/** + * Simple library class for working with JNI (Java Native Interface) + * + * @see http://adamheinrich.com/2012/how-to-load-native-jni-library-from-jar + * + * @author Adam Heirnich <adam@adamh.cz>, http://www.adamh.cz + */ +public class NativeUtils { + + /** + * Private constructor - this class will never be instanced + */ + private NativeUtils() { + } + + /** + * Loads library from current JAR archive + * + * The file from JAR is copied into system temporary directory and then loaded. The temporary file is deleted after exiting. + * Method uses String as filename because the pathname is "abstract", not system-dependent. + * + * @param path The filename inside JAR as absolute path (beginning with '/'), e.g. /package/File.ext + * @throws IOException If temporary file creation or read/write operation fails + * @throws IllegalArgumentException If source file (param path) does not exist + * @throws IllegalArgumentException If the path is not absolute or if the filename is shorter than three characters (restriction of {@see File#createTempFile(java.lang.String, java.lang.String)}). 
+ */ + public static void loadLibraryFromJar(String path) throws IOException { + + if (!path.startsWith("/")) { + throw new IllegalArgumentException("The path has to be absolute (start with '/')."); + } + + // Obtain filename from path + String[] parts = path.split("/"); + String filename = (parts.length > 1) ? parts[parts.length - 1] : null; + + // Split filename to prexif and suffix (extension) + String prefix = ""; + String suffix = null; + if (filename != null) { + parts = filename.split("\\.", 2); + prefix = parts[0]; + suffix = (parts.length > 1) ? "."+parts[parts.length - 1] : null; // Thanks, davs! :-) + } + + // Check if the filename is okay + if (filename == null || prefix.length() < 3) { + throw new IllegalArgumentException("The filename has to be at least 3 characters long."); + } + + // Prepare temporary file + File temp = File.createTempFile(prefix, suffix); + temp.deleteOnExit(); + + if (!temp.exists()) { + throw new FileNotFoundException("File " + temp.getAbsolutePath() + " does not exist."); + } + + // Prepare buffer for data copying + byte[] buffer = new byte[1024]; + int readBytes; + + // Open and check input stream + InputStream is = NativeUtils.class.getResourceAsStream(path); + if (is == null) { + throw new FileNotFoundException("File " + path + " was not found inside JAR."); + } + + // Open output stream and copy data between source file in JAR and the temporary file + OutputStream os = new FileOutputStream(temp); + try { + while ((readBytes = is.read(buffer)) != -1) { + os.write(buffer, 0, readBytes); + } + } finally { + // If read/write fails, close streams safely before throwing an exception + os.close(); + is.close(); + } + + // Finally, load the library + System.load(temp.getAbsolutePath()); + } +} diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/Trainer.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/Trainer.java new file mode 100644 index 000000000..e5ac8502a --- /dev/null +++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/Trainer.java @@ -0,0 +1,235 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j.util; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dmlc.xgboost4j.IEvaluation; +import org.dmlc.xgboost4j.Booster; +import org.dmlc.xgboost4j.DMatrix; +import org.dmlc.xgboost4j.IObjective; + + +/** + * trainer for xgboost + * @author hzx + */ +public class Trainer { + private static final Log logger = LogFactory.getLog(Trainer.class); + + /** + * Train a booster with given parameters. + * @param params Booster params. + * @param dtrain Data to be trained. + * @param round Number of boosting iterations. + * @param watchs a group of items to be evaluated during training, this allows user to watch performance on the validation set. 
+ * @param obj customized objective (set to null if not used) + * @param eval customized evaluation (set to null if not used) + * @return trained booster + */ + public static Booster train(Iterable> params, DMatrix dtrain, int round, + Iterable> watchs, IObjective obj, IEvaluation eval) throws XGBoostError { + + //collect eval matrixs + String[] evalNames; + DMatrix[] evalMats; + List names = new ArrayList<>(); + List mats = new ArrayList<>(); + + for(Entry evalEntry : watchs) { + names.add(evalEntry.getKey()); + mats.add(evalEntry.getValue()); + } + + evalNames = names.toArray(new String[names.size()]); + evalMats = mats.toArray(new DMatrix[mats.size()]); + + //collect all data matrixs + DMatrix[] allMats; + if(evalMats!=null && evalMats.length>0) { + allMats = new DMatrix[evalMats.length+1]; + allMats[0] = dtrain; + System.arraycopy(evalMats, 0, allMats, 1, evalMats.length); + } + else { + allMats = new DMatrix[1]; + allMats[0] = dtrain; + } + + //initialize booster + Booster booster = new Booster(params, allMats); + + //begin to train + for(int iter=0; iter0) { + String evalInfo; + if(eval != null) { + evalInfo = booster.evalSet(evalMats, evalNames, iter, eval); + } + else { + evalInfo = booster.evalSet(evalMats, evalNames, iter); + } + logger.info(evalInfo); + } + } + return booster; + } + + /** + * Cross-validation with given paramaters. + * @param params Booster params. + * @param data Data to be trained. + * @param round Number of boosting iterations. + * @param nfold Number of folds in CV. + * @param metrics Evaluation metrics to be watched in CV. + * @param obj customized objective (set to null if not used) + * @param eval customized evaluation (set to null if not used) + * @return evaluation history + */ + public static String[] crossValiation(Iterable> params, DMatrix data, int round, int nfold, String[] metrics, IObjective obj, IEvaluation eval) throws XGBoostError { + CVPack[] cvPacks = makeNFold(data, nfold, params, metrics); + String[] evalHist = new String[round]; + String[] results = new String[cvPacks.length]; + for(int i=0; i> params, String[] evalMetrics) throws XGBoostError { + List samples = genRandPermutationNums(0, (int) data.rowNum()); + int step = samples.size()/nfold; + int[] testSlice = new int[step]; + int[] trainSlice = new int[samples.size()-step]; + int testid, trainid; + CVPack[] cvPacks = new CVPack[nfold]; + for(int i=0; i(i*step) && j<(i*step+step) && testid genRandPermutationNums(int start, int end) { + List samples = new ArrayList<>(); + for(int i=start; i > cvMap = new HashMap<>(); + String aggResult = results[0].split("\t")[0]; + for(String result : results) { + String[] items = result.split("\t"); + for(int i=1; i()); + } + cvMap.get(key).add(value); + } + } + + for(String key : cvMap.keySet()) { + float value = 0f; + for(Float tvalue : cvMap.get(key)) { + value += tvalue; + } + value /= cvMap.get(key).size(); + aggResult += String.format("\tcv-%s:%f", key, value); + } + + return aggResult; + } +} diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/XGBoostError.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/XGBoostError.java new file mode 100644 index 000000000..dc7a9a0b2 --- /dev/null +++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/util/XGBoostError.java @@ -0,0 +1,26 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j.util; + +/** + * custom error class for xgboost + * @author hzx + */ +public class XGBoostError extends Exception{ + public XGBoostError(String message) { + super(message); + } +} diff --git a/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/wrapper/XgboostJNI.java b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/wrapper/XgboostJNI.java new file mode 100644 index 000000000..fe181347a --- /dev/null +++ b/java/xgboost4j/src/main/java/org/dmlc/xgboost4j/wrapper/XgboostJNI.java @@ -0,0 +1,50 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j.wrapper; + +/** + * xgboost jni wrapper functions for xgboost_wrapper.h + * change 2015-7-6: *use a long[] (length=1) as container of handle to get the output DMatrix or Booster + * @author hzx + */ +public class XgboostJNI { + public final static native String XGBGetLastError(); + public final static native int XGDMatrixCreateFromFile(String fname, int silent, long[] out); + public final static native int XGDMatrixCreateFromCSR(long[] indptr, int[] indices, float[] data, long[] out); + public final static native int XGDMatrixCreateFromCSC(long[] colptr, int[] indices, float[] data, long[] out); + public final static native int XGDMatrixCreateFromMat(float[] data, int nrow, int ncol, float missing, long[] out); + public final static native int XGDMatrixSliceDMatrix(long handle, int[] idxset, long[] out); + public final static native int XGDMatrixFree(long handle); + public final static native int XGDMatrixSaveBinary(long handle, String fname, int silent); + public final static native int XGDMatrixSetFloatInfo(long handle, String field, float[] array); + public final static native int XGDMatrixSetUIntInfo(long handle, String field, int[] array); + public final static native int XGDMatrixSetGroup(long handle, int[] group); + public final static native int XGDMatrixGetFloatInfo(long handle, String field, float[][] info); + public final static native int XGDMatrixGetUIntInfo(long handle, String filed, int[][] info); + public final static native int XGDMatrixNumRow(long handle, long[] row); + public final static native int XGBoosterCreate(long[] handles, long[] out); + public final static native int XGBoosterFree(long handle); + public final static native int XGBoosterSetParam(long handle, String name, String value); + public final static native int XGBoosterUpdateOneIter(long handle, int iter, long dtrain); + public final static native int XGBoosterBoostOneIter(long handle, long dtrain, float[] grad, float[] hess); + public final static native int 
XGBoosterEvalOneIter(long handle, int iter, long[] dmats, String[] evnames, String[] eval_info); + public final static native int XGBoosterPredict(long handle, long dmat, int option_mask, long ntree_limit, float[][] predicts); + public final static native int XGBoosterLoadModel(long handle, String fname); + public final static native int XGBoosterSaveModel(long handle, String fname); + public final static native int XGBoosterLoadModelFromBuffer(long handle, long buf, long len); + public final static native int XGBoosterGetModelRaw(long handle, String[] out_string); + public final static native int XGBoosterDumpModel(long handle, String fmap, int with_stats, String[][] out_strings); +} diff --git a/java/xgboost4j/src/test/java/org/dmlc/xgboost4j/BoosterTest.java b/java/xgboost4j/src/test/java/org/dmlc/xgboost4j/BoosterTest.java new file mode 100644 index 000000000..eb022b7e8 --- /dev/null +++ b/java/xgboost4j/src/test/java/org/dmlc/xgboost4j/BoosterTest.java @@ -0,0 +1,108 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import junit.framework.TestCase; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dmlc.xgboost4j.util.Trainer; +import org.dmlc.xgboost4j.util.XGBoostError; +import org.junit.Test; + +/** + * test cases for Booster + * @author hzx + */ +public class BoosterTest { + public static class EvalError implements IEvaluation { + private static final Log logger = LogFactory.getLog(EvalError.class); + + String evalMetric = "custom_error"; + + public EvalError() { + } + + @Override + public String getMetric() { + return evalMetric; + } + + @Override + public float eval(float[][] predicts, DMatrix dmat) { + float error = 0f; + float[] labels; + try { + labels = dmat.getLabel(); + } catch (XGBoostError ex) { + logger.error(ex); + return -1f; + } + int nrow = predicts.length; + for(int i=0; i0) { + error++; + } + else if(labels[i]==1f && predicts[i][0]<=0) { + error++; + } + } + + return error/labels.length; + } + } + + @Test + public void testBoosterBasic() throws XGBoostError { + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + + //set params + Map paramMap = new HashMap() { + { + put("eta", 1.0); + put("max_depth", 2); + put("silent", 1); + put("objective", "binary:logistic"); + } + }; + Iterable> param = paramMap.entrySet(); + + //set watchList + List> watchs = new ArrayList<>(); + watchs.add(new AbstractMap.SimpleEntry<>("train", trainMat)); + watchs.add(new AbstractMap.SimpleEntry<>("test", testMat)); + + //set round + int round = 2; + + //train a boost model + Booster booster = Trainer.train(param, trainMat, round, watchs, null, null); + + //predict raw output + float[][] predicts = booster.predict(testMat, true); + + 
//eval + IEvaluation eval = new EvalError(); + //error must be less than 0.1 + TestCase.assertTrue(eval.eval(predicts, testMat)<0.1f); + } +} diff --git a/java/xgboost4j/src/test/java/org/dmlc/xgboost4j/DMatrixTest.java b/java/xgboost4j/src/test/java/org/dmlc/xgboost4j/DMatrixTest.java new file mode 100644 index 000000000..343dd3ed9 --- /dev/null +++ b/java/xgboost4j/src/test/java/org/dmlc/xgboost4j/DMatrixTest.java @@ -0,0 +1,102 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package org.dmlc.xgboost4j; + +import java.util.Arrays; +import java.util.Random; +import junit.framework.TestCase; +import org.dmlc.xgboost4j.util.XGBoostError; +import org.junit.Test; + +/** + * test cases for DMatrix + * @author hzx + */ +public class DMatrixTest { + + @Test + public void testCreateFromFile() throws XGBoostError { + //create DMatrix from file + DMatrix dmat = new DMatrix("../../demo/data/agaricus.txt.test"); + //get label + float[] labels = dmat.getLabel(); + //check length + TestCase.assertTrue(dmat.rowNum()==labels.length); + //set weights + float[] weights = Arrays.copyOf(labels, labels.length); + dmat.setWeight(weights); + float[] dweights = dmat.getWeight(); + TestCase.assertTrue(Arrays.equals(weights, dweights)); + } + + @Test + public void testCreateFromCSR() throws XGBoostError { + //create Matrix from csr format sparse Matrix and labels + /** + * sparse matrix + * 1 0 2 3 0 + * 4 0 2 3 5 + * 3 1 2 5 0 + */ + float[] data = new float[] {1, 2, 3, 4, 2, 3, 5, 3, 1, 2, 5}; + int[] colIndex = new int[] {0, 2, 3, 0, 2, 3, 4, 0, 1, 2, 3}; + long[] rowHeaders = new long[] {0, 3, 7, 11}; + DMatrix dmat1 = new DMatrix(rowHeaders, colIndex, data, DMatrix.SparseType.CSR); + //check row num + System.out.println(dmat1.rowNum()); + TestCase.assertTrue(dmat1.rowNum()==3); + //test set label + float[] label1 = new float[] {1, 0, 1}; + dmat1.setLabel(label1); + float[] label2 = dmat1.getLabel(); + TestCase.assertTrue(Arrays.equals(label1, label2)); + } + + @Test + public void testCreateFromDenseMatrix() throws XGBoostError { + //create DMatrix from 10*5 dense matrix + int nrow = 10; + int ncol = 5; + float[] data0 = new float[nrow*ncol]; + //put random nums + Random random = new Random(); + for(int i=0; i<data0.length; i++) { + data0[i] = random.nextFloat(); + } + + //create label + float[] label0 = new float[nrow]; + for(int i=0; i<nrow; i++) { + label0[i] = random.nextFloat(); + } + + DMatrix dmat0 = new DMatrix(data0, nrow, ncol); + dmat0.setLabel(label0); + + //check + TestCase.assertTrue(dmat0.rowNum()==10); + TestCase.assertTrue(dmat0.getLabel().length==10); + } +} diff --git a/java/xgboost4j_wrapper.cpp b/java/xgboost4j_wrapper.cpp new file mode 100644 --- /dev/null +++ b/java/xgboost4j_wrapper.cpp +#include <jni.h> +#include "../wrapper/xgboost_wrapper.h" +#include "xgboost4j_wrapper.h" + +JNIEXPORT jstring JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBGetLastError + (JNIEnv *jenv, jclass jcls) { + jstring jresult = 0 ; + char* result = 0; + result = (char *)XGBGetLastError(); + if (result) jresult = jenv->NewStringUTF((const char *)result); + return jresult; +} + +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromFile + (JNIEnv *jenv, jclass jcls, jstring jfname, jint jsilent, jlongArray jout) { + jint jresult = 0 ; + char *fname = (char *) 0 ; + int silent; + void* result[1]; + unsigned long out[1]; + + fname = (char *)jenv->GetStringUTFChars(jfname, 0); + + silent = (int)jsilent; + jresult = (jint) XGDMatrixCreateFromFile((char const
*)fname, silent, result); + + + *(void **)&out[0] = *result; + + if (fname) jenv->ReleaseStringUTFChars(jfname, (const char *)fname); + + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) out); + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromCSR + * Signature: ([J[J[F)J + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromCSR + (JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jlongArray jout) { + jint jresult = 0 ; + bst_ulong nindptr ; + bst_ulong nelem; + void *result[1]; + unsigned long out[1]; + + jlong* indptr = jenv->GetLongArrayElements(jindptr, 0); + jint* indices = jenv->GetIntArrayElements(jindices, 0); + jfloat* data = jenv->GetFloatArrayElements(jdata, 0); + nindptr = (bst_ulong)jenv->GetArrayLength(jindptr); + nelem = (bst_ulong)jenv->GetArrayLength(jdata); + + jresult = (jint) XGDMatrixCreateFromCSR((unsigned long const *)indptr, (unsigned int const *)indices, (float const *)data, nindptr, nelem, result); + *(void **)&out[0] = *result; + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) out); + + //release + jenv->ReleaseLongArrayElements(jindptr, indptr, 0); + jenv->ReleaseIntArrayElements(jindices, indices, 0); + jenv->ReleaseFloatArrayElements(jdata, data, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromCSC + * Signature: ([J[J[F)J + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromCSC + (JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jlongArray jout) { + jint jresult = 0; + bst_ulong nindptr ; + bst_ulong nelem; + void *result[1]; + unsigned long out[1]; + + jlong* indptr = jenv->GetLongArrayElements(jindptr, NULL); + jint* indices = jenv->GetIntArrayElements(jindices, 0); + jfloat* data = jenv->GetFloatArrayElements(jdata, NULL); + nindptr = (bst_ulong)jenv->GetArrayLength(jindptr); + nelem = (bst_ulong)jenv->GetArrayLength(jdata); + + jresult = (jint) XGDMatrixCreateFromCSC((unsigned long const *)indptr, (unsigned int const *)indices, (float const *)data, nindptr, nelem, result); + *(void **)&out[0] = *result; + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) out); + + //release + jenv->ReleaseLongArrayElements(jindptr, indptr, 0); + jenv->ReleaseIntArrayElements(jindices, indices, 0); + jenv->ReleaseFloatArrayElements(jdata, data, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromMat + * Signature: ([FIIF)J + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromMat + (JNIEnv *jenv, jclass jcls, jfloatArray jdata, jint jnrow, jint jncol, jfloat jmiss, jlongArray jout) { + jint jresult = 0 ; + bst_ulong nrow ; + bst_ulong ncol ; + float miss ; + void *result[1]; + unsigned long out[1]; + + + jfloat* data = jenv->GetFloatArrayElements(jdata, 0); + nrow = (bst_ulong)jnrow; + ncol = (bst_ulong)jncol; + miss = (float)jmiss; + + jresult = (jint) XGDMatrixCreateFromMat((float const *)data, nrow, ncol, miss, result); + *(void **)&out[0] = *result; + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) out); + + //release + jenv->ReleaseFloatArrayElements(jdata, data, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSliceDMatrix + * Signature: (J[I)J + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSliceDMatrix + 
(JNIEnv *jenv, jclass jcls, jlong jhandle, jintArray jindexset, jlongArray jout) { + jint jresult = 0 ; + void *handle = (void *) 0 ; + bst_ulong len; + void *result[1]; + unsigned long out[1]; + + jint* indexset = jenv->GetIntArrayElements(jindexset, 0); + handle = *(void **)&jhandle; + len = (bst_ulong)jenv->GetArrayLength(jindexset); + + jresult = (jint) XGDMatrixSliceDMatrix(handle, (int const *)indexset, len, result); + *(void **)&out[0] = *result; + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) out); + + //release + jenv->ReleaseIntArrayElements(jindexset, indexset, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixFree + * Signature: (J)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixFree + (JNIEnv *jenv, jclass jcls, jlong jhandle) { + jint jresult = 0; + void *handle = (void *) 0 ; + handle = *(void **)&jhandle; + jresult = (jint) XGDMatrixFree(handle); + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSaveBinary + * Signature: (JLjava/lang/String;I)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSaveBinary + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfname, jint jsilent) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *fname = (char *) 0 ; + int silent ; + handle = *(void **)&jhandle; + fname = 0; + fname = (char *)jenv->GetStringUTFChars(jfname, 0); + + silent = (int)jsilent; + jresult = (jint) XGDMatrixSaveBinary(handle, (char const *)fname, silent); + if (fname) jenv->ReleaseStringUTFChars(jfname, (const char *)fname); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSetFloatInfo + * Signature: (JLjava/lang/String;[F)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSetFloatInfo + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfield, jfloatArray jarray) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *field = (char *) 0 ; + bst_ulong len; + + + handle = *(void **)&jhandle; + + field = (char *)jenv->GetStringUTFChars(jfield, 0); + + + jfloat* array = jenv->GetFloatArrayElements(jarray, NULL); + len = (bst_ulong)jenv->GetArrayLength(jarray); + jresult = (jint) XGDMatrixSetFloatInfo(handle, (char const *)field, (float const *)array, len); + + //release + if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field); + jenv->ReleaseFloatArrayElements(jarray, array, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSetUIntInfo + * Signature: (JLjava/lang/String;[I)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSetUIntInfo + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfield, jintArray jarray) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *field = (char *) 0 ; + bst_ulong len ; + handle = *(void **)&jhandle; + field = 0; + field = (char *)jenv->GetStringUTFChars(jfield, 0); + + + jint* array = jenv->GetIntArrayElements(jarray, NULL); + len = (bst_ulong)jenv->GetArrayLength(jarray); + + jresult = (jint) XGDMatrixSetUIntInfo(handle, (char const *)field, (unsigned int const *)array, len); + //release + if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field); + jenv->ReleaseIntArrayElements(jarray, array, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSetGroup + * Signature: (J[I)V + */ +JNIEXPORT jint JNICALL 
Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSetGroup + (JNIEnv * jenv, jclass jcls, jlong jhandle, jintArray jarray) { + jint jresult = 0; + void *handle = (void *) 0 ; + bst_ulong len ; + + handle = *(void **)&jhandle; + jint* array = jenv->GetIntArrayElements(jarray, NULL); + len = (bst_ulong)jenv->GetArrayLength(jarray); + + jresult = (jint) XGDMatrixSetGroup(handle, (unsigned int const *)array, len); + + //release + jenv->ReleaseIntArrayElements(jarray, array, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixGetFloatInfo + * Signature: (JLjava/lang/String;)[F + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixGetFloatInfo + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfield, jobjectArray jout) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *field = (char *) 0 ; + bst_ulong len[1]; + *len = 0; + float *result[1]; + + handle = *(void **)&jhandle; + field = 0; + if (jfield) { + field = (char *)jenv->GetStringUTFChars(jfield, 0); + if (!field) return 0; + } + + jresult = (jint) XGDMatrixGetFloatInfo(handle, (char const *)field, len, (const float **) result); + + if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field); + + jsize jlen = (jsize)*len; + jfloatArray jarray = jenv->NewFloatArray(jlen); + jenv->SetFloatArrayRegion(jarray, 0, jlen, (jfloat *) *result); + jenv->SetObjectArrayElement(jout, 0, (jobject) jarray); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixGetUIntInfo + * Signature: (JLjava/lang/String;)[I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixGetUIntInfo + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfield, jobjectArray jout) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *field = (char *) 0 ; + bst_ulong len[1]; + *len = 0; + unsigned int *result[1]; + + handle = *(void **)&jhandle; + field = (char *)jenv->GetStringUTFChars(jfield, 0); + + jresult = (jint) XGDMatrixGetUIntInfo(handle, (char const *)field, len, (const unsigned int **) result); + + if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field); + + jsize jlen = (jsize)*len; + jintArray jarray = jenv->NewIntArray(jlen); + jenv->SetIntArrayRegion(jarray, 0, jlen, (jint *) *result); + jenv->SetObjectArrayElement(jout, 0, jarray); + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixNumRow + * Signature: (J)J + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixNumRow + (JNIEnv *jenv, jclass jcls, jlong jhandle, jlongArray jout) { + jint jresult = 0 ; + void *handle = (void *) 0 ; + bst_ulong result[1]; + handle = *(void **)&jhandle; + jresult = (jint) XGDMatrixNumRow(handle, result); + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) result); + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterCreate + * Signature: ([J)J + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterCreate + (JNIEnv *jenv, jclass jcls, jlongArray jhandles, jlongArray jout) { + jint jresult = 0; + void **handles = 0; + bst_ulong len = 0; + void *result[1]; + jlong* cjhandles = 0; + unsigned long out[1]; + + if(jhandles) { + len = (bst_ulong)jenv->GetArrayLength(jhandles); + handles = new void*[len]; + //put handle from jhandles to chandles + cjhandles = jenv->GetLongArrayElements(jhandles, 0); + for(bst_ulong i=0; i<len; i++) { + handles[i] = *(void **)&cjhandles[i]; + } + } + + jresult = (jint) XGBoosterCreate(handles, len, result); + + //release + if(jhandles) { + delete[] handles; + jenv->ReleaseLongArrayElements(jhandles, cjhandles,
0); + } + + *(void **)&out[0] = *result; + jenv->SetLongArrayRegion(jout, 0, 1, (const jlong *) out); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterFree + * Signature: (J)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterFree + (JNIEnv *jenv, jclass jcls, jlong jhandle) { + void *handle = (void *) 0 ; + handle = *(void **)&jhandle; + return (jint) XGBoosterFree(handle); +} + + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterSetParam + * Signature: (JLjava/lang/String;Ljava/lang/String;)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterSetParam + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jname, jstring jvalue) { + jint jresult = -1; + void *handle = (void *) 0 ; + char *name = (char *) 0 ; + char *value = (char *) 0 ; + handle = *(void **)&jhandle; + + name = (char *)jenv->GetStringUTFChars(jname, 0); + value = (char *)jenv->GetStringUTFChars(jvalue, 0); + + jresult = (jint) XGBoosterSetParam(handle, (char const *)name, (char const *)value); + if (name) jenv->ReleaseStringUTFChars(jname, (const char *)name); + if (value) jenv->ReleaseStringUTFChars(jvalue, (const char *)value); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterUpdateOneIter + * Signature: (JIJ)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterUpdateOneIter + (JNIEnv *jenv, jclass jcls, jlong jhandle, jint jiter, jlong jdtrain) { + void *handle = (void *) 0 ; + int iter ; + void *dtrain = (void *) 0 ; + handle = *(void **)&jhandle; + iter = (int)jiter; + dtrain = *(void **)&jdtrain; + return (jint) XGBoosterUpdateOneIter(handle, iter, dtrain); +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterBoostOneIter + * Signature: (JJ[F[F)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterBoostOneIter + (JNIEnv *jenv, jclass jcls, jlong jhandle, jlong jdtrain, jfloatArray jgrad, jfloatArray jhess) { + jint jresult = 0; + void *handle = (void *) 0 ; + void *dtrain = (void *) 0 ; + bst_ulong len ; + + handle = *(void **)&jhandle; + dtrain = *(void **)&jdtrain; + jfloat* grad = jenv->GetFloatArrayElements(jgrad, 0); + jfloat* hess = jenv->GetFloatArrayElements(jhess, 0); + len = (bst_ulong)jenv->GetArrayLength(jgrad); + jresult = (jint) XGBoosterBoostOneIter(handle, dtrain, grad, hess, len); + + //release + jenv->ReleaseFloatArrayElements(jgrad, grad, 0); + jenv->ReleaseFloatArrayElements(jhess, hess, 0); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterEvalOneIter + * Signature: (JI[J[Ljava/lang/String;)Ljava/lang/String; + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterEvalOneIter + (JNIEnv *jenv, jclass jcls, jlong jhandle, jint jiter, jlongArray jdmats, jobjectArray jevnames, jobjectArray jout) { + jint jresult = 0 ; + void *handle = (void *) 0 ; + int iter ; + void **dmats = 0; + char **evnames = 0; + bst_ulong len ; + char *result[1]; + + handle = *(void **)&jhandle; + iter = (int)jiter; + len = (bst_ulong)jenv->GetArrayLength(jdmats); + + + if(len > 0) { + dmats = new void*[len]; + evnames = new char*[len]; + } + + //put handle from jhandles to chandles + jlong* cjdmats = jenv->GetLongArrayElements(jdmats, 0); + for(bst_ulong i=0; i<len; i++) { + dmats[i] = *(void **)&cjdmats[i]; + jstring jevname = (jstring)jenv->GetObjectArrayElement(jevnames, i); + evnames[i] = (char *)jenv->GetStringUTFChars(jevname, 0); + } + + jresult = (jint)
XGBoosterEvalOneIter(handle, iter, dmats, (char const *(*))evnames, len, (const char **) result); + + if(len > 0) { + delete[] dmats; + //release string chars + for(bst_ulong i=0; i<len; i++) { + jstring jevname = (jstring)jenv->GetObjectArrayElement(jevnames, i); + jenv->ReleaseStringUTFChars(jevname, (const char*)evnames[i]); + } + delete[] evnames; + jenv->ReleaseLongArrayElements(jdmats, cjdmats, 0); + } + + jstring jinfo = 0; + if (*result) jinfo = jenv->NewStringUTF((const char *) *result); + jenv->SetObjectArrayElement(jout, 0, jinfo); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterPredict + * Signature: (JJIJ)[F + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterPredict + (JNIEnv *jenv, jclass jcls, jlong jhandle, jlong jdmat, jint joption_mask, jlong jntree_limit, jobjectArray jout) { + jint jresult = 0; + void *handle = (void *) 0 ; + void *dmat = (void *) 0 ; + int option_mask ; + unsigned int ntree_limit ; + bst_ulong len[1]; + *len = 0; + float *result[1]; + + handle = *(void **)&jhandle; + dmat = *(void **)&jdmat; + option_mask = (int)joption_mask; + ntree_limit = (unsigned int)jntree_limit; + + jresult = (jint) XGBoosterPredict(handle, dmat, option_mask, ntree_limit, len, (const float **) result); + + jsize jlen = (jsize)*len; + jfloatArray jarray = jenv->NewFloatArray(jlen); + jenv->SetFloatArrayRegion(jarray, 0, jlen, (jfloat *) *result); + jenv->SetObjectArrayElement(jout, 0, jarray); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterLoadModel + * Signature: (JLjava/lang/String;)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterLoadModel + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfname) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *fname = (char *) 0 ; + handle = *(void **)&jhandle; + + fname = (char *)jenv->GetStringUTFChars(jfname, 0); + + + jresult = (jint) XGBoosterLoadModel(handle,(char const *)fname); + if (fname) jenv->ReleaseStringUTFChars(jfname, (const char *)fname); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterSaveModel + * Signature: (JLjava/lang/String;)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterSaveModel + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfname) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *fname = (char *) 0 ; + handle = *(void **)&jhandle; + fname = 0; + fname = (char *)jenv->GetStringUTFChars(jfname, 0); + + jresult = (jint) XGBoosterSaveModel(handle, (char const *)fname); + if (fname) jenv->ReleaseStringUTFChars(jfname, (const char *)fname); + + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterLoadModelFromBuffer + * Signature: (JJJ)V + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterLoadModelFromBuffer + (JNIEnv *jenv, jclass jcls, jlong jhandle, jlong jbuf, jlong jlen) { + void *handle = (void *) 0 ; + void *buf = (void *) 0 ; + bst_ulong len ; + handle = *(void **)&jhandle; + buf = *(void **)&jbuf; + len = (bst_ulong)jlen; + return (jint) XGBoosterLoadModelFromBuffer(handle, (void const *)buf, len); +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterGetModelRaw + * Signature: (J)Ljava/lang/String; + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterGetModelRaw + (JNIEnv * jenv, jclass jcls, jlong jhandle, jobjectArray jout) { + jint jresult = 0 ; + jstring
jinfo = 0; + void *handle = (void *) 0 ; + bst_ulong len[1]; + *len = 0; + char *result[1]; + handle = *(void **)&jhandle; + + jresult = (jint)XGBoosterGetModelRaw(handle, len, (const char **) result); + if (*result){ + jinfo = jenv->NewStringUTF((const char *) *result); + jenv->SetObjectArrayElement(jout, 0, jinfo); + } + return jresult; +} + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterDumpModel + * Signature: (JLjava/lang/String;I)[Ljava/lang/String; + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterDumpModel + (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring jfmap, jint jwith_stats, jobjectArray jout) { + jint jresult = 0; + void *handle = (void *) 0 ; + char *fmap = (char *) 0 ; + int with_stats ; + bst_ulong len[1]; + *len = 0; + + char **result[1]; + handle = *(void **)&jhandle; + fmap = 0; + if (jfmap) { + fmap = (char *)jenv->GetStringUTFChars(jfmap, 0); + if (!fmap) return 0; + } + with_stats = (int)jwith_stats; + + jresult = (jint) XGBoosterDumpModel(handle, (const char *)fmap, with_stats, len, (const char ***) result); + + jsize jlen = (jsize)*len; + jobjectArray jinfos = jenv->NewObjectArray(jlen, jenv->FindClass("java/lang/String"), jenv->NewStringUTF("")); + for(int i=0 ; i<jlen ; i++) { + jenv->SetObjectArrayElement(jinfos, i, jenv->NewStringUTF((const char*) result[0][i])); + } + jenv->SetObjectArrayElement(jout, 0, jinfos); + + if (fmap) jenv->ReleaseStringUTFChars(jfmap, (const char *)fmap); + + return jresult; +} \ No newline at end of file diff --git a/java/xgboost4j_wrapper.h b/java/xgboost4j_wrapper.h new file mode 100644 index 000000000..93764ef53 --- /dev/null +++ b/java/xgboost4j_wrapper.h @@ -0,0 +1,221 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include <jni.h> +/* Header for class org_dmlc_xgboost4j_wrapper_XgboostJNI */ + +#ifndef _Included_org_dmlc_xgboost4j_wrapper_XgboostJNI +#define _Included_org_dmlc_xgboost4j_wrapper_XgboostJNI +#ifdef __cplusplus +extern "C" { +#endif +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBGetLastError + * Signature: ()Ljava/lang/String; + */ +JNIEXPORT jstring JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBGetLastError + (JNIEnv *, jclass); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromFile + * Signature: (Ljava/lang/String;I[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromFile + (JNIEnv *, jclass, jstring, jint, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromCSR + * Signature: ([J[I[F[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromCSR + (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromCSC + * Signature: ([J[I[F[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromCSC + (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixCreateFromMat + * Signature: ([FIIF[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixCreateFromMat + (JNIEnv *, jclass, jfloatArray, jint, jint, jfloat, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSliceDMatrix + * Signature: (J[I[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSliceDMatrix + (JNIEnv *,
jclass, jlong, jintArray, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixFree + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixFree + (JNIEnv *, jclass, jlong); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSaveBinary + * Signature: (JLjava/lang/String;I)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSaveBinary + (JNIEnv *, jclass, jlong, jstring, jint); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSetFloatInfo + * Signature: (JLjava/lang/String;[F)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSetFloatInfo + (JNIEnv *, jclass, jlong, jstring, jfloatArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSetUIntInfo + * Signature: (JLjava/lang/String;[I)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSetUIntInfo + (JNIEnv *, jclass, jlong, jstring, jintArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixSetGroup + * Signature: (J[I)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixSetGroup + (JNIEnv *, jclass, jlong, jintArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixGetFloatInfo + * Signature: (JLjava/lang/String;[[F)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixGetFloatInfo + (JNIEnv *, jclass, jlong, jstring, jobjectArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixGetUIntInfo + * Signature: (JLjava/lang/String;[[I)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixGetUIntInfo + (JNIEnv *, jclass, jlong, jstring, jobjectArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGDMatrixNumRow + * Signature: (J[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGDMatrixNumRow + (JNIEnv *, jclass, jlong, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterCreate + * Signature: ([J[J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterCreate + (JNIEnv *, jclass, jlongArray, jlongArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterFree + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterFree + (JNIEnv *, jclass, jlong); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterSetParam + * Signature: (JLjava/lang/String;Ljava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterSetParam + (JNIEnv *, jclass, jlong, jstring, jstring); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterUpdateOneIter + * Signature: (JIJ)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterUpdateOneIter + (JNIEnv *, jclass, jlong, jint, jlong); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterBoostOneIter + * Signature: (JJ[F[F)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterBoostOneIter + (JNIEnv *, jclass, jlong, jlong, jfloatArray, jfloatArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterEvalOneIter + * Signature: (JI[J[Ljava/lang/String;[Ljava/lang/String;)I + */ +JNIEXPORT jint JNICALL 
Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterEvalOneIter + (JNIEnv *, jclass, jlong, jint, jlongArray, jobjectArray, jobjectArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterPredict + * Signature: (JJIJ[[F)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterPredict + (JNIEnv *, jclass, jlong, jlong, jint, jlong, jobjectArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterLoadModel + * Signature: (JLjava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterLoadModel + (JNIEnv *, jclass, jlong, jstring); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterSaveModel + * Signature: (JLjava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterSaveModel + (JNIEnv *, jclass, jlong, jstring); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterLoadModelFromBuffer + * Signature: (JJJ)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterLoadModelFromBuffer + (JNIEnv *, jclass, jlong, jlong, jlong); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterGetModelRaw + * Signature: (J[Ljava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterGetModelRaw + (JNIEnv *, jclass, jlong, jobjectArray); + +/* + * Class: org_dmlc_xgboost4j_wrapper_XgboostJNI + * Method: XGBoosterDumpModel + * Signature: (JLjava/lang/String;I[[Ljava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_org_dmlc_xgboost4j_wrapper_XgboostJNI_XGBoosterDumpModel + (JNIEnv *, jclass, jlong, jstring, jint, jobjectArray); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/scripts/travis_R_script.sh b/scripts/travis_R_script.sh new file mode 100755 index 000000000..5a9ea7528 --- /dev/null +++ b/scripts/travis_R_script.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Test R package of xgboost +set -e +export _R_CHECK_TIMINGS_=0 +export R_BUILD_ARGS="--no-build-vignettes --no-manual" +export R_CHECK_ARGS="--no-vignettes --no-manual" + +curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh +chmod 755 ./travis-tool.sh +./travis-tool.sh bootstrap +make Rpack +cd ./xgboost +../travis-tool.sh install_deps +../travis-tool.sh run_tests \ No newline at end of file diff --git a/scripts/travis_after_failure.sh b/scripts/travis_after_failure.sh new file mode 100755 index 000000000..15b74d87f --- /dev/null +++ b/scripts/travis_after_failure.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +if [ ${TASK} == "R-package" ]; then + cat R-package/xgboost.Rcheck/*.log +fi diff --git a/scripts/travis_java_script.sh b/scripts/travis_java_script.sh new file mode 100755 index 000000000..e0583e1fb --- /dev/null +++ b/scripts/travis_java_script.sh @@ -0,0 +1,7 @@ +# Test java package of xgboost +set -e +cd java +./create_wrap.sh +cd xgboost4j +mvn clean install -DskipTests=true +mvn test diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh new file mode 100755 index 000000000..5702d35cd --- /dev/null +++ b/scripts/travis_script.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# main script of travis +if [ ${TASK} == "lint" ]; then + make lint || exit -1 +fi + +if [ ${TASK} == "build" ]; then + make all CXX=${CXX} || exit -1 +fi + +if [ ${TASK} == "build-with-dmlc" ]; then + cd dmlc-core + cp make/config.mk . + echo "USE_S3=1" >> config.mk + make all CXX=${CXX}|| exit -1 + cd .. 
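+ # now rebuild xgboost itself, pointing the build at the dmlc-core checkout compiled above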
+ make dmlc=dmlc-core CXX=${CXX} || exit -1 +fi + +if [ ${TASK} == "R-package" ]; then + scripts/travis_R_script.sh || exit -1 +fi + +if [ ${TASK} == "python-package" ]; then + make all CXX=${CXX} || exit -1 + nosetests tests/python || exit -1 +fi + +if [ ${TASK} == "java-package" ]; then + make java CXX=${CXX} || exit -1 + scripts/travis_java_script.sh || exit -1 +fi diff --git a/src/data.h b/src/data.h index 63dd2d78f..3c4a14987 100644 --- a/src/data.h +++ b/src/data.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_DATA_H -#define XGBOOST_DATA_H /*! + * Copyright (c) 2014 by Contributors * \file data.h * \brief the input data structure for gradient boosting * \author Tianqi Chen */ +#ifndef XGBOOST_DATA_H_ +#define XGBOOST_DATA_H_ + #include #include #include "utils/utils.h" @@ -32,7 +34,7 @@ struct bst_gpair { bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {} }; -/*! +/*! * \brief extra information that might needed by gbm and tree module * these information are not necessarily presented, and can be empty */ @@ -102,7 +104,7 @@ struct RowBatch : public SparseBatch { return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i])); } }; -/*! +/*! * \brief read-only column batch, used to access columns, * the columns are not required to be continuous */ @@ -131,7 +133,7 @@ class IFMatrix { /*!\brief get column iterator */ virtual utils::IIterator<ColBatch> *ColIterator(void) = 0; /*! - * \brief get the column iterator associated with FMatrix with subset of column features + * \brief get the column iterator associated with FMatrix with subset of column features * \param fset is the list of column index set that must be contained in the returning Column iterator * \return the column iterator, initialized so that it reads the elements in fset */ @@ -154,11 +156,11 @@ class IFMatrix { /*! \brief get number of non-missing entries in column */ virtual size_t GetColSize(size_t cidx) const = 0; /*! \brief get column density */ - virtual float GetColDensity(size_t cidx) const = 0; + virtual float GetColDensity(size_t cidx) const = 0; /*! \brief reference of buffered rowset */ virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0; // virtual destructor virtual ~IFMatrix(void){} }; } // namespace xgboost -#endif // XGBOOST_DATA_H +#endif // XGBOOST_DATA_H_ diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp index 3d2f36f5f..17d90e556 100644 --- a/src/gbm/gblinear-inl.hpp +++ b/src/gbm/gblinear-inl.hpp @@ -1,11 +1,13 @@ -#ifndef XGBOOST_GBM_GBLINEAR_INL_HPP_ -#define XGBOOST_GBM_GBLINEAR_INL_HPP_ /*!
+ * Copyright by Contributors * \file gblinear-inl.hpp * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net * the update rule is parallel coordinate descent (shotgun) * \author Tianqi Chen */ +#ifndef XGBOOST_GBM_GBLINEAR_INL_HPP_ +#define XGBOOST_GBM_GBLINEAR_INL_HPP_ + #include #include #include @@ -33,10 +35,10 @@ class GBLinear : public IGradBooster { model.param.SetParam(name, val); } } - virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { + virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*) model.LoadModel(fi); } - virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { + virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*) model.SaveModel(fo); } virtual void InitModel(void) { @@ -92,7 +94,8 @@ class GBLinear : public IGradBooster { sum_hess += p.hess * v * v; } float &w = model[fid][gid]; - bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w)); + bst_float dw = static_cast<bst_float>(param.learning_rate * + param.CalcDelta(sum_grad, sum_hess, w)); w += dw; // update grad value for (bst_uint j = 0; j < col.length; ++j) { @@ -258,12 +261,12 @@ class GBLinear : public IGradBooster { std::fill(weight.begin(), weight.end(), 0.0f); } // save the model to file - inline void SaveModel(utils::IStream &fo) const { + inline void SaveModel(utils::IStream &fo) const { // NOLINT(*) fo.Write(&param, sizeof(Param)); fo.Write(weight); } // load model from file - inline void LoadModel(utils::IStream &fi) { + inline void LoadModel(utils::IStream &fi) { // NOLINT(*) utils::Assert(fi.Read(&param, sizeof(Param)) != 0, "Load LinearBooster"); fi.Read(&weight); } diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp index fe8d778e4..13ad44c57 100644 --- a/src/gbm/gbm.cpp +++ b/src/gbm/gbm.cpp @@ -1,3 +1,4 @@ +// Copyright by Contributors #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #define NOMINMAX diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h index f07d277ac..60b7474e1 100644 --- a/src/gbm/gbm.h +++ b/src/gbm/gbm.h @@ -1,11 +1,14 @@ -#ifndef XGBOOST_GBM_GBM_H_ -#define XGBOOST_GBM_GBM_H_ /*! + * Copyright by Contributors * \file gbm.h * \brief interface of gradient booster, that learns through gradient statistics * \author Tianqi Chen */ +#ifndef XGBOOST_GBM_GBM_H_ +#define XGBOOST_GBM_GBM_H_ + #include +#include #include "../data.h" #include "../utils/io.h" #include "../utils/fmap.h" @@ -13,7 +16,7 @@ namespace xgboost { /*! \brief namespace for gradient booster */ namespace gbm { -/*! +/*! * \brief interface of gradient boosting model */ class IGradBooster { @@ -29,26 +32,26 @@ class IGradBooster { * \param fi input stream * \param with_pbuffer whether the incoming data contains pbuffer */ - virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0; + virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0; // NOLINT(*) /*! * \brief save model to stream * \param fo output stream * \param with_pbuffer whether save out pbuffer */ - virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0; + virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0; // NOLINT(*) /*! * \brief initialize the model */ virtual void InitModel(void) = 0; - /*! + /*! * \brief reset the predict buffer * this will invalidate all the previous cached results * and recalculate from scratch */ virtual void ResetPredBuffer(size_t num_pbuffer) {} - /*! + /*!
* \brief whether the model allow lazy checkpoint - return true if model is only updated in DoBoost + return true if model is only updated in DoBoost * after all Allreduce calls */ virtual bool AllowLazyCheckPoint(void) const { @@ -76,20 +79,20 @@ class IGradBooster { * the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size") * \param info extra side information that may be needed for prediction * \param out_preds output vector to hold the predictions - * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means + * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means * we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear */ virtual void Predict(IFMatrix *p_fmat, int64_t buffer_offset, const BoosterInfo &info, std::vector<float> *out_preds, - unsigned ntree_limit = 0) = 0; + unsigned ntree_limit = 0) = 0; /*! * \brief online prediction funciton, predict score for one instance at a time * NOTE: use the batch prediction interface if possible, batch prediction is usually * more efficient than online prediction * This function is NOT threadsafe, make sure you only call from one thread - * + * * \param inst the instance you want to predict * \param out_preds output vector to hold the predictions * \param ntree_limit limit the number of trees used in prediction @@ -106,7 +109,7 @@ class IGradBooster { * \param p_fmat feature matrix * \param info extra side information that may be needed for prediction * \param out_preds output vector to hold the predictions - * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means + * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means * we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear */ virtual void PredictLeaf(IFMatrix *p_fmat, diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index c868c302a..9335ef8e7 100--- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -1,13 +1,16 @@ -#ifndef XGBOOST_GBM_GBTREE_INL_HPP_ -#define XGBOOST_GBM_GBTREE_INL_HPP_ /*!
+ * Copyright by Contributors * \file gbtree-inl.hpp * \brief gradient boosted tree implementation * \author Tianqi Chen */ +#ifndef XGBOOST_GBM_GBTREE_INL_HPP_ +#define XGBOOST_GBM_GBTREE_INL_HPP_ + #include #include #include +#include #include "./gbm.h" #include "../utils/omp.h" #include "../tree/updater.h" @@ -39,7 +42,7 @@ class GBTree : public IGradBooster { tparam.SetParam(name, val); if (trees.size() == 0) mparam.SetParam(name, val); } - virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { + virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*) this->Clear(); utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, "GBTree: invalid model file"); @@ -62,10 +65,10 @@ "GBTree: invalid model file"); } } - virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { + virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*) utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree"); if (with_pbuffer) { - fo.Write(&mparam, sizeof(ModelParam)); + fo.Write(&mparam, sizeof(ModelParam)); } else { ModelParam p = mparam; p.num_pbuffer = 0; @@ -129,7 +132,7 @@ class GBTree : public IGradBooster { int64_t buffer_offset, const BoosterInfo &info, std::vector<float> *out_preds, - unsigned ntree_limit = 0) { + unsigned ntree_limit = 0) { int nthread; #pragma omp parallel { @@ -160,12 +163,12 @@ class GBTree : public IGradBooster { this->Pred(batch[i], buffer_offset < 0 ? -1 : buffer_offset + ridx, gid, info.GetRoot(ridx), &feats, - &preds[ridx * mparam.num_output_group + gid], stride, + &preds[ridx * mparam.num_output_group + gid], stride, ntree_limit); } } } - } + } virtual void Predict(const SparseBatch::Inst &inst, std::vector<float> *out_preds, unsigned ntree_limit, @@ -178,10 +181,10 @@ // loop over output groups for (int gid = 0; gid < mparam.num_output_group; ++gid) { this->Pred(inst, -1, gid, root_index, &thread_temp[0], - &(*out_preds)[gid], mparam.num_output_group, + &(*out_preds)[gid], mparam.num_output_group, ntree_limit); } - } + } virtual void PredictLeaf(IFMatrix *p_fmat, const BoosterInfo &info, std::vector<float> *out_preds, @@ -196,7 +199,6 @@ thread_temp[i].Init(mparam.num_feature); } this->PredPath(p_fmat, info, out_preds, ntree_limit); - } virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) { std::vector<std::string> dump; @@ -260,7 +262,7 @@ // update the trees for (size_t i = 0; i < updaters.size(); ++i) { updaters[i]->Update(gpair, p_fmat, info, new_trees); - } + } // optimization, update buffer, if possible // this is only under distributed column mode // for safety check of lazy checkpoint @@ -287,7 +289,7 @@ } // update buffer by pre-cached position inline void UpdateBufferByPosition(IFMatrix *p_fmat, - int64_t buffer_offset, + int64_t buffer_offset, int bst_group, const tree::RegTree &new_tree, const int* leaf_position) { @@ -313,11 +315,11 @@ int bst_group, unsigned root_index, tree::RegTree::FVec *p_feats, - float *out_pred, size_t stride, + float *out_pred, size_t stride, unsigned ntree_limit) { size_t itop = 0; float psum = 0.0f; - // sum of leaf vector + // sum of leaf vector std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f); const int64_t bid = mparam.BufferOffset(buffer_index, bst_group); // number of valid trees @@ -339,7 +341,7 @@ for (int j =
0; j < mparam.size_leaf_vector; ++j) { vec_psum[j] += trees[i]->leafvec(tid)[j]; } - if(--treeleft == 0) break; + if (--treeleft == 0) break; } } p_feats->Drop(inst); @@ -365,7 +367,7 @@ // number of valid trees if (ntree_limit == 0 || ntree_limit > trees.size()) { ntree_limit = static_cast<unsigned>(trees.size()); - } + } std::vector<float> &preds = *out_preds; preds.resize(info.num_row * ntree_limit); // start collecting the prediction @@ -389,7 +391,7 @@ } } } - + // --- data structure --- /*! \brief training parameters */ struct TrainParam { @@ -442,10 +444,10 @@ int num_feature; /*! \brief size of predicton buffer allocated used for buffering */ int64_t num_pbuffer; - /*! + /*! * \brief how many output group a single instance can produce * this affects the behavior of number of output we have: - * suppose we have n instance and k group, output will be k*n + * suppose we have n instance and k group, output will be k*n */ int num_output_group; /*! \brief size of leaf vector needed in tree */ @@ -478,8 +480,8 @@ inline size_t PredBufferSize(void) const { return num_output_group * num_pbuffer * (size_leaf_vector + 1); } - /*! - * \brief get the buffer offset given a buffer index and group id + /*! + * \brief get the buffer offset given a buffer index and group id * \return calculated buffer offset */ inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const { diff --git a/src/io/dmlc_simple.cpp b/src/io/dmlc_simple.cpp index 065877a19..3fbf34734 100644 --- a/src/io/dmlc_simple.cpp +++ b/src/io/dmlc_simple.cpp @@ -1,6 +1,8 @@ +// Copyright by Contributors #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #define NOMINMAX +#include #include "../utils/io.h" // implements a single no split version of DMLC @@ -9,7 +11,7 @@ namespace xgboost { namespace utils { /*!
- * \brief line split implementation from single FILE + * \brief line split implementation from single FILE * simply returns lines of files, used for stdin */ class SingleFileSplit : public dmlc::InputSplit { @@ -32,7 +34,7 @@ class SingleFileSplit : public dmlc::InputSplit { } virtual size_t Read(void *ptr, size_t size) { return std::fread(ptr, 1, size, fp_); - } + } virtual void Write(const void *ptr, size_t size) { utils::Error("cannot do write in inputsplit"); } @@ -47,13 +49,13 @@ chunk_end_); out_rec->dptr = chunk_begin_; out_rec->size = next - chunk_begin_; - chunk_begin_ = next; + chunk_begin_ = next; return true; } virtual bool NextChunk(Blob *out_chunk) { if (chunk_begin_ == chunk_end_) { if (!LoadChunk()) return false; - } + } out_chunk->dptr = chunk_begin_; out_chunk->size = chunk_end_ - chunk_begin_; chunk_begin_ = chunk_end_; @@ -64,8 +66,8 @@ if (max_size <= overflow_.length()) { *size = 0; return true; } - if (overflow_.length() != 0) { - std::memcpy(buf, BeginPtr(overflow_), overflow_.length()); + if (overflow_.length() != 0) { + std::memcpy(buf, BeginPtr(overflow_), overflow_.length()); } size_t olen = overflow_.length(); overflow_.resize(0); @@ -88,13 +90,13 @@ return true; } } - + protected: inline const char* FindLastRecordBegin(const char *begin, const char *end) { if (begin == end) return begin; for (const char *p = end - 1; p != begin; --p) { - if (*p == '\n' || *p == '\r') return p + 1; + if (*p == '\n' || *p == '\r') return p + 1; } return begin; } @@ -143,7 +145,7 @@ class StdFile : public dmlc::Stream { public: explicit StdFile(std::FILE *fp, bool use_stdio) : fp(fp), use_stdio(use_stdio) { - } + } virtual ~StdFile(void) { this->Close(); } @@ -154,7 +156,7 @@ std::fwrite(ptr, size, 1, fp); } virtual void Seek(size_t pos) { - std::fseek(fp, static_cast<long>(pos), SEEK_SET); + std::fseek(fp, static_cast<long>(pos), SEEK_SET); // NOLINT(*) } virtual size_t Tell(void) { return std::ftell(fp); @@ -197,7 +199,7 @@ Stream *Stream::Create(const char *fname, const char * const mode, bool allow_nu "to use hdfs, s3 or distributed version, compile with make dmlc=1"; utils::Check(strncmp(fname, "s3://", 5) != 0, msg); utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg); - + std::FILE *fp = NULL; bool use_stdio = false; using namespace std; diff --git a/src/io/io.cpp b/src/io/io.cpp index dd4336170..b3713f0c5 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -1,3 +1,4 @@ +// Copyright 2014 by Contributors #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #define NOMINMAX @@ -17,7 +18,7 @@ DataMatrix* LoadDataMatrix(const char *fname, const char *cache_file) { using namespace std; std::string fname_ = fname; - + const char *dlm = strchr(fname, '#'); if (dlm != NULL) { utils::Check(strchr(dlm + 1, '#') == NULL, @@ -29,7 +30,7 @@ cache_file = dlm +1; } - if (cache_file == NULL) { + if (cache_file == NULL) { if (!std::strcmp(fname, "stdin") || !std::strncmp(fname, "s3://", 5) || !std::strncmp(fname, "hdfs://", 7) || @@ -42,7 +43,7 @@ utils::FileStream fs(utils::FopenCheck(fname, "rb")); utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); fs.Seek(0); - if (magic == DMatrixSimple::kMagic) { + if (magic == DMatrixSimple::kMagic) { DMatrixSimple *dmat = new DMatrixSimple();
dmat->LoadBinary(fs, silent, fname); fs.Close(); @@ -81,7 +82,7 @@ } } -void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { +void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { if (dmat.magic == DMatrixSimple::kMagic) { const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat); p_dmat->SaveBinary(fname, silent); diff --git a/src/io/io.h b/src/io/io.h index ed075977c..267bb0bff 100644 --- a/src/io/io.h +++ b/src/io/io.h @@ -1,11 +1,13 @@ -#ifndef XGBOOST_IO_IO_H_ -#define XGBOOST_IO_IO_H_ /*! + * Copyright 2014 by Contributors * \file io.h * \brief handles input data format of xgboost * I/O module handles a specific DMatrix format * \author Tianqi Chen */ +#ifndef XGBOOST_IO_IO_H_ +#define XGBOOST_IO_IO_H_ + #include "../data.h" #include "../learner/dmatrix.h" @@ -32,7 +34,7 @@ DataMatrix* LoadDataMatrix(const char *fname, bool loadsplit, const char *cache_file = NULL); /*! - * \brief save DataMatrix into stream, + * \brief save DataMatrix into stream, * note: the saved dmatrix format may not be in exactly same as input * SaveDMatrix will choose the best way to materialize the dmatrix. * \param dmat the dmatrix to be saved @@ -40,7 +42,6 @@ * \param silent whether print message during saving */ void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false); - } // namespace io } // namespace xgboost #endif // XGBOOST_IO_IO_H_ diff --git a/src/io/libsvm_parser.h b/src/io/libsvm_parser.h index 0e69d0467..92eeaf35d 100644 --- a/src/io/libsvm_parser.h +++ b/src/io/libsvm_parser.h @@ -22,7 +22,7 @@ namespace io { /*! \brief page returned by libsvm parser */ struct LibSVMPage : public SparsePage { std::vector<float> label; - // overload clear + // overload clear inline void Clear() { SparsePage::Clear(); label.clear(); @@ -35,7 +35,7 @@ */ class LibSVMPageFactory { public: - explicit LibSVMPageFactory() + LibSVMPageFactory() : bytes_read_(0), at_head_(true) { } inline bool Init(void) { @@ -85,7 +85,7 @@ data->resize(nthread); bytes_read_ += chunk.size; utils::Assert(chunk.size != 0, "LibSVMParser.FileData"); - char *head = reinterpret_cast<char*>(chunk.dptr); + char *head = reinterpret_cast<char*>(chunk.dptr); #pragma omp parallel num_threads(nthread_) { // threadid @@ -150,7 +150,7 @@ } return begin; } - + private: // nthread int nthread_; @@ -199,12 +199,13 @@ class LibSVMParser : public utils::IIterator<LibSVMPage> { inline size_t bytes_read(void) const { return itr.get_factory().bytes_read(); } + private: bool at_end_; size_t data_ptr_; std::vector<LibSVMPage> *data_; utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr; -}; +}; } // namespace io } // namespace xgboost diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 79455d130..3012af564 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -1,11 +1,15 @@ -#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_ -#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_ /*!
+ * Copyright (c) 2014 by Contributors * \file page_dmatrix-inl.hpp * row iterator based on sparse page * \author Tianqi Chen */ +#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_ +#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_ + #include +#include +#include #include "../data.h" #include "../utils/iterator.h" #include "../utils/thread_buffer.h" @@ -94,12 +98,12 @@ class DMatrixPageBase : public DataMatrix { fbin.Close(); if (!silent) { utils::Printf("DMatrixPage: %lux%lu is saved to %s\n", - static_cast<unsigned long>(mat.info.num_row()), - static_cast<unsigned long>(mat.info.num_col()), fname_); + static_cast<unsigned long>(mat.info.num_row()), // NOLINT(*) + static_cast<unsigned long>(mat.info.num_col()), fname_); // NOLINT(*) } } /*! \brief load and initialize the iterator with fi */ - inline void LoadBinary(utils::FileStream &fi, + inline void LoadBinary(utils::FileStream &fi, // NOLINT(*) bool silent, const char *fname_) { this->set_cache_file(fname_); @@ -114,8 +118,8 @@ iter_->Load(fs); if (!silent) { utils::Printf("DMatrixPage: %lux%lu matrix is loaded", - static_cast<unsigned long>(info.num_row()), - static_cast<unsigned long>(info.num_col())); + static_cast<unsigned long>(info.num_row()), // NOLINT(*) + static_cast<unsigned long>(info.num_col())); // NOLINT(*) if (fname_ != NULL) { utils::Printf(" from %s\n", fname_); } else { @@ -141,7 +145,7 @@ } this->set_cache_file(cache_file); std::string fname_row = std::string(cache_file) + ".row.blob"; - utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); + utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); SparsePage page; size_t bytes_write = 0; double tstart = rabit::utils::GetTime(); @@ -178,8 +182,8 @@ if (page.data.size() != 0) { page.Save(&fo); } - fo.Close(); - iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb"))); + fo.Close(); + iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb"))); // save data matrix utils::FileStream fs(utils::FopenCheck(cache_file, "wb")); int tmagic = kMagic; @@ -188,8 +192,8 @@ fs.Close(); if (!silent) { utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n", - static_cast<unsigned long>(info.num_row()), - static_cast<unsigned long>(info.num_col()), + static_cast<unsigned long>(info.num_row()), // NOLINT(*) + static_cast<unsigned long>(info.num_col()), // NOLINT(*) uri); } } @@ -241,12 +245,12 @@ class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> { virtual IFMatrix *fmat(void) const { return fmat_; } - virtual void set_cache_file(const std::string &cache_file) { + virtual void set_cache_file(const std::string &cache_file) { } virtual void CheckMagic(int tmagic) { utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic || tmagic == DMatrixPageBase<0xffffab03>::kMagic, - "invalid format,magic number mismatch"); + "invalid format,magic number mismatch"); } /*! \brief the real fmatrix */ IFMatrix *fmat_; diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 18f4c6dee..2aaec5b19 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -1,10 +1,16 @@ -#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ -#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ /*! + * Copyright (c) 2014 by Contributors * \file page_fmatrix-inl.hpp * col iterator based on sparse page * \author Tianqi Chen */ +#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ +#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ + +#include +#include +#include + namespace xgboost { namespace io { /*!
\brief thread buffer iterator */ class ThreadColPageIterator: public utils::IIterator<ColBatch> { @@ -42,9 +48,9 @@ } // set index set inline void SetIndexSet(const std::vector<bst_uint> &fset, bool load_all) { - itr.get_factory().SetIndexSet(fset, load_all); + itr.get_factory().SetIndexSet(fset, load_all); } - + private: // output data ColBatch out_; @@ -96,7 +102,7 @@ struct ColConvertFactory { return true; } } - if (tmp_.Size() != 0){ + if (tmp_.Size() != 0) { this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop, *enabled_, val); return true; @@ -104,7 +110,7 @@ return false; } } - inline void Destroy(void) {} + inline void Destroy(void) {} inline void BeforeFirst(void) {} inline void MakeColPage(const SparsePage &prow, const bst_uint *ridx, @@ -115,7 +121,7 @@ #pragma omp parallel { nthread = omp_get_num_threads(); - int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1); + int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1); if (nthread > max_nthread) { nthread = max_nthread; } @@ -130,10 +136,10 @@ int tid = omp_get_thread_num(); for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) { const SparseBatch::Entry &e = prow.data[j]; - if (enabled[e.index]) { + if (enabled[e.index]) { builder.AddBudget(e.index, tid); } - } + } } builder.InitStorage(); #pragma omp parallel for schedule(static) num_threads(nthread) @@ -169,7 +175,7 @@ // buffered rowset std::vector<bst_uint> *buffered_rowset_; // enabled marks - const std::vector<bool> *enabled_; + const std::vector<bool> *enabled_; // internal temp cache SparsePage tmp_; /*! \brief page size 256 M */ @@ -191,7 +197,7 @@ class FMatrixPage : public IFMatrix { if (iter_ != NULL) delete iter_; } /*! \return whether column access is enabled */ - virtual bool HaveColAccess(void) const { + virtual bool HaveColAccess(void) const { return col_size_.size() != 0; } /*! \brief get number of colmuns */ @@ -212,7 +218,7 @@ size_t nmiss = num_buffered_row_ - (col_size_[cidx]); return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_; } - virtual void InitColAccess(const std::vector<bool> &enabled, + virtual void InitColAccess(const std::vector<bool> &enabled, float pkeep, size_t max_row_perbatch) { if (this->HaveColAccess()) return; if (TryLoadColData()) return; @@ -242,11 +248,11 @@ /*!
* \brief colmun based iterator */ - virtual utils::IIterator *ColIterator(const std::vector &fset) { + virtual utils::IIterator *ColIterator(const std::vector &fset) { size_t ncol = this->NumCol(); col_index_.resize(0); for (size_t i = 0; i < fset.size(); ++i) { - if (fset[i] < ncol) col_index_.push_back(fset[i]); + if (fset[i] < ncol) col_index_.push_back(fset[i]); } col_iter_.SetIndexSet(col_index_, false); col_iter_.BeforeFirst(); @@ -255,13 +261,13 @@ class FMatrixPage : public IFMatrix { // set the cache file name inline void set_cache_file(const std::string &cache_file) { col_data_name_ = std::string(cache_file) + ".col.blob"; - col_meta_name_ = std::string(cache_file) + ".col.meta"; + col_meta_name_ = std::string(cache_file) + ".col.meta"; } protected: inline bool TryLoadColData(void) { std::FILE *fi = fopen64(col_meta_name_.c_str(), "rb"); - if (fi == NULL) return false; + if (fi == NULL) return false; utils::FileStream fs(fi); LoadMeta(&fs); fs.Close(); @@ -306,12 +312,12 @@ class FMatrixPage : public IFMatrix { SparsePage *pcol; while (citer.Next(pcol)) { for (size_t i = 0; i < pcol->Size(); ++i) { - col_size_[i] += pcol->offset[i + 1] - pcol->offset[i]; + col_size_[i] += pcol->offset[i + 1] - pcol->offset[i]; } pcol->Save(&fo); size_t spage = pcol->MemCostBytes(); bytes_write += spage; - double tnow = rabit::utils::GetTime(); + double tnow = rabit::utils::GetTime(); double tdiff = tnow - tstart; utils::Printf("Writting to %s in %g MB/s, %lu MB written current speed:%g MB/s\n", col_data_name_.c_str(), diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 3876c21ad..190cbdcdf 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -1,13 +1,15 @@ -#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_ -#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_ /*! 
+ * Copyright 2014 by Contributors * \file simple_dmatrix-inl.hpp - * \brief simple implementation of DMatrixS that can be used + * \brief simple implementation of DMatrixS that can be used * the data format of xgboost is templatized, which means it can accept * any data structure that implements the function defined by FMatrix * this file is a specific implementation of input data structure that can be used by BoostLearner * \author Tianqi Chen */ +#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_ +#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_ + #include #include #include @@ -119,13 +121,13 @@ class DMatrixSimple : public DataMatrix { for (size_t i = 0; i < batch.data.size(); ++i) { info.info.num_col = std::max(info.info.num_col, static_cast(batch.data[i].index+1)); - } + } } if (!silent) { utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n", - static_cast(info.num_row()), - static_cast(info.num_col()), - static_cast(row_data_.size()), uri); + static_cast(info.num_row()), // NOLINT(*) + static_cast(info.num_col()), // NOLINT(*) + static_cast(row_data_.size()), uri); // NOLINT(*) } // try to load in additional file if (!loadsplit) { @@ -141,7 +143,7 @@ class DMatrixSimple : public DataMatrix { "DMatrix: weight data does not match the number of rows in features"); } std::string mname = name + ".base_margin"; - if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) { + if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) { } } } @@ -165,10 +167,11 @@ class DMatrixSimple : public DataMatrix { * \param silent whether print information during loading * \param fname file name, used to print message */ - inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { + inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { // NOLINT(*) int tmagic; utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format"); - utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname); + utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", + fname == NULL ? 
"" : fname); info.LoadBinary(fs); LoadBinary(fs, &row_ptr_, &row_data_); @@ -176,9 +179,9 @@ class DMatrixSimple : public DataMatrix { if (!silent) { utils::Printf("%lux%lu matrix with %lu entries is loaded", - static_cast(info.num_row()), - static_cast(info.num_col()), - static_cast(row_data_.size())); + static_cast(info.num_row()), // NOLINT(*) + static_cast(info.num_col()), // NOLINT(*) + static_cast(row_data_.size())); // NOLINT(*) if (fname != NULL) { utils::Printf(" from %s\n", fname); } else { @@ -205,9 +208,9 @@ class DMatrixSimple : public DataMatrix { if (!silent) { utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n", - static_cast(info.num_row()), - static_cast(info.num_col()), - static_cast(row_data_.size()), fname); + static_cast(info.num_row()), // NOLINT(*) + static_cast(info.num_col()), // NOLINT(*) + static_cast(row_data_.size()), fname); // NOLINT(*) if (info.group_ptr.size() != 0) { utils::Printf("data contains %u groups\n", static_cast(info.group_ptr.size()-1)); @@ -256,7 +259,7 @@ class DMatrixSimple : public DataMatrix { * \param ptr pointer data * \param data data content */ - inline static void SaveBinary(utils::IStream &fo, + inline static void SaveBinary(utils::IStream &fo, // NOLINT(*) const std::vector &ptr, const std::vector &data) { size_t nrow = ptr.size() - 1; @@ -272,7 +275,7 @@ class DMatrixSimple : public DataMatrix { * \param out_ptr pointer data * \param out_data data content */ - inline static void LoadBinary(utils::IStream &fi, + inline static void LoadBinary(utils::IStream &fi, // NOLINT(*) std::vector *out_ptr, std::vector *out_data) { size_t nrow; @@ -314,7 +317,7 @@ class DMatrixSimple : public DataMatrix { DMatrixSimple *parent_; // temporal space for batch RowBatch batch_; - }; + }; }; } // namespace io } // namespace xgboost diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index fc6aab8f9..0e0da4461 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -1,11 +1,15 @@ -#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ -#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file simple_fmatrix-inl.hpp * \brief the input data structure for gradient boosting * \author Tianqi Chen */ +#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ +#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ + #include +#include +#include #include "../data.h" #include "../utils/utils.h" #include "../utils/random.h" @@ -30,7 +34,7 @@ class FMatrixS : public IFMatrix { } // destructor virtual ~FMatrixS(void) { - if (iter_ != NULL) delete iter_; + if (iter_ != NULL) delete iter_; } /*! \return whether column access is enabled */ virtual bool HaveColAccess(void) const { @@ -39,7 +43,7 @@ class FMatrixS : public IFMatrix { /*! \brief get number of colmuns */ virtual size_t NumCol(void) const { utils::Check(this->HaveColAccess(), "NumCol:need column access"); - return col_size_.size() - 1; + return col_size_.size(); } /*! 
\brief get number of buffered rows */ virtual const std::vector &buffered_rowset(void) const { @@ -54,7 +58,7 @@ class FMatrixS : public IFMatrix { size_t nmiss = buffered_rowset_.size() - col_size_[cidx]; return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); } - virtual void InitColAccess(const std::vector &enabled, + virtual void InitColAccess(const std::vector &enabled, float pkeep, size_t max_row_perbatch) { if (this->HaveColAccess()) return; this->InitColData(enabled, pkeep, max_row_perbatch); @@ -85,7 +89,7 @@ class FMatrixS : public IFMatrix { size_t ncol = this->NumCol(); col_iter_.col_index_.resize(0); for (size_t i = 0; i < fset.size(); ++i) { - if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); + if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); } col_iter_.BeforeFirst(); return &col_iter_; @@ -94,7 +98,7 @@ class FMatrixS : public IFMatrix { * \brief save column access data into stream * \param fo output stream to save to */ - inline void SaveColAccess(utils::IStream &fo) const { + inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*) size_t n = 0; fo.Write(&n, sizeof(n)); } @@ -102,10 +106,10 @@ class FMatrixS : public IFMatrix { * \brief load column access data from stream * \param fo output stream to load from */ - inline void LoadColAccess(utils::IStream &fi) { + inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*) // do nothing in load col access } - + protected: /*! * \brief intialize column data @@ -129,7 +133,7 @@ class FMatrixS : public IFMatrix { for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) { SparsePage *pcol = col_iter_.cpages_[i]; for (size_t j = 0; j < pcol->Size(); ++j) { - col_size_[j] += pcol->offset[j + 1] - pcol->offset[j]; + col_size_[j] += pcol->offset[j + 1] - pcol->offset[j]; } } } @@ -139,7 +143,7 @@ class FMatrixS : public IFMatrix { * \param pcol the target column */ inline void MakeOneBatch(const std::vector &enabled, - float pkeep, + float pkeep, SparsePage *pcol) { // clear rowset buffered_rowset_.clear(); @@ -153,14 +157,14 @@ class FMatrixS : public IFMatrix { pcol->Clear(); utils::ParallelGroupBuilder builder(&pcol->offset, &pcol->data); - builder.InitBudget(0, nthread); + builder.InitBudget(info_.num_col(), nthread); // start working iter_->BeforeFirst(); while (iter_->Next()) { const RowBatch &batch = iter_->Value(); bmap.resize(bmap.size() + batch.size, true); - long batch_size = static_cast(batch.size); - for (long i = 0; i < batch_size; ++i) { + long batch_size = static_cast(batch.size); // NOLINT(*) + for (long i = 0; i < batch_size; ++i) { // NOLINT(*) bst_uint ridx = static_cast(batch.base_rowid + i); if (pkeep == 1.0f || random::SampleBinary(pkeep)) { buffered_rowset_.push_back(ridx); @@ -169,13 +173,13 @@ class FMatrixS : public IFMatrix { } } #pragma omp parallel for schedule(static) - for (long i = 0; i < batch_size; ++i) { + for (long i = 0; i < batch_size; ++i) { // NOLINT(*) int tid = omp_get_thread_num(); bst_uint ridx = static_cast(batch.base_rowid + i); if (bmap[ridx]) { RowBatch::Inst inst = batch[i]; for (bst_uint j = 0; j < inst.length; ++j) { - if (enabled[inst[j].index]){ + if (enabled[inst[j].index]) { builder.AddBudget(inst[j].index, tid); } } @@ -183,18 +187,18 @@ class FMatrixS : public IFMatrix { } } builder.InitStorage(); - + iter_->BeforeFirst(); while (iter_->Next()) { const RowBatch &batch = iter_->Value(); #pragma omp parallel for schedule(static) - for (long i = 0; i < static_cast(batch.size); ++i) { + for (long i = 0; i < static_cast(batch.size); ++i) { 
// NOLINT(*) int tid = omp_get_thread_num(); bst_uint ridx = static_cast(batch.base_rowid + i); if (bmap[ridx]) { RowBatch::Inst inst = batch[i]; for (bst_uint j = 0; j < inst.length; ++j) { - if (enabled[inst[j].index]) { + if (enabled[inst[j].index]) { builder.Push(inst[j].index, Entry((bst_uint)(batch.base_rowid+i), inst[j].fvalue), tid); @@ -204,7 +208,8 @@ class FMatrixS : public IFMatrix { } } - utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data"); + utils::Assert(pcol->Size() == info_.num_col(), + "inconsistent col data"); // sort columns bst_omp_uint ncol = static_cast(pcol->Size()); #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) @@ -260,7 +265,7 @@ class FMatrixS : public IFMatrix { #pragma omp parallel { nthread = omp_get_num_threads(); - int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1); + int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1); if (nthread > max_nthread) { nthread = max_nthread; } @@ -276,7 +281,7 @@ class FMatrixS : public IFMatrix { RowBatch::Inst inst = batch[i]; for (bst_uint j = 0; j < inst.length; ++j) { const SparseBatch::Entry &e = inst[j]; - if (enabled[e.index]) { + if (enabled[e.index]) { builder.AddBudget(e.index, tid); } } @@ -329,10 +334,10 @@ class FMatrixS : public IFMatrix { static_cast(pcol->offset[ridx + 1] - pcol->offset[ridx])); } batch_.col_index = BeginPtr(col_index_); - batch_.col_data = BeginPtr(col_data_); + batch_.col_data = BeginPtr(col_data_); return true; } - virtual const ColBatch &Value(void) const { + virtual const ColBatch &Value(void) const { return batch_; } inline void Clear(void) { @@ -346,7 +351,7 @@ class FMatrixS : public IFMatrix { // column content std::vector col_data_; // column sparse pages - std::vector cpages_; + std::vector cpages_; // data pointer size_t data_ptr_; // temporal space for batch @@ -356,7 +361,7 @@ class FMatrixS : public IFMatrix { // column iterator ColBatchIter col_iter_; // shared meta info with DMatrix - const learner::MetaInfo &info_; + const learner::MetaInfo &info_; // row iterator utils::IIterator *iter_; /*! \brief list of row index that are buffered */ @@ -366,4 +371,4 @@ class FMatrixS : public IFMatrix { }; } // namespace io } // namespace xgboost -#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP +#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP_ diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h index d94141a6e..24546f785 100644 --- a/src/io/sparse_batch_page.h +++ b/src/io/sparse_batch_page.h @@ -1,18 +1,22 @@ -#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_ -#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_ /*! + * Copyright (c) 2014 by Contributors * \file sparse_batch_page.h * content holder of sparse batch that can be saved to disk * the representation can be effectively * use in external memory computation * \author Tianqi Chen */ +#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_ +#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_ + +#include +#include #include "../data.h" namespace xgboost { namespace io { /*! - * \brief storage unit of sparse batch + * \brief storage unit of sparse batch */ class SparsePage { public: @@ -96,7 +100,7 @@ class SparsePage { } /*! 
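MakeOneBatch and MakeColPage above both drive utils::ParallelGroupBuilder through the same two-pass protocol: AddBudget counts the entries each column will receive, InitStorage converts counts into offsets, and Push drops entries into their reserved slots. A single-threaded sketch of that row-to-column conversion, with simplified types standing in for SparsePage:

#include <cstddef>
#include <vector>

// Single-threaded sketch of the budget/push pattern: pass 1 counts
// entries per column, a prefix sum turns counts into offsets, and
// pass 2 writes each entry into its reserved slot.
struct DemoEntry { unsigned index; float fvalue; };

void RowToCol(const std::vector<std::vector<DemoEntry> > &rows, size_t num_col,
              std::vector<size_t> *offset, std::vector<DemoEntry> *data) {
  offset->assign(num_col + 1, 0);
  for (size_t r = 0; r < rows.size(); ++r)            // pass 1: budget
    for (size_t j = 0; j < rows[r].size(); ++j)
      (*offset)[rows[r][j].index + 1] += 1;
  for (size_t c = 0; c < num_col; ++c)                // counts -> offsets
    (*offset)[c + 1] += (*offset)[c];
  data->resize(offset->back());
  std::vector<size_t> top(offset->begin(), offset->end() - 1);
  for (size_t r = 0; r < rows.size(); ++r)            // pass 2: push
    for (size_t j = 0; j < rows[r].size(); ++j) {
      DemoEntry e = { static_cast<unsigned>(r), rows[r][j].fvalue };
      (*data)[top[rows[r][j].index]++] = e;
    }
}
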
* \brief save the data to fo, when a page was written - * to disk it must contain all the elements in the + * to disk it must contain all the elements in the * \param fo output stream */ inline void Save(utils::IStream *fo) const { @@ -124,7 +128,7 @@ class SparsePage { */ inline bool PushLoad(utils::IStream *fi) { if (!fi->Read(&disk_offset_)) return false; - data.resize(offset.back() + disk_offset_.back()); + data.resize(offset.back() + disk_offset_.back()); if (disk_offset_.back() != 0) { utils::Check(fi->Read(BeginPtr(data) + offset.back(), disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0, @@ -138,7 +142,7 @@ class SparsePage { } return true; } - /*! + /*! * \brief Push row batch into the page * \param batch the row batch */ @@ -154,7 +158,7 @@ class SparsePage { offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0]; } } - /*! + /*! * \brief Push a sparse page * \param batch the row page */ @@ -170,7 +174,7 @@ class SparsePage { offset[i + begin] = top + batch.offset[i + 1]; } } - /*! + /*! * \brief Push one instance into page * \param row an instance row */ @@ -202,7 +206,7 @@ class SparsePage { }; /*! * \brief factory class for SparsePage, - * used in threadbuffer template + * used in threadbuffer template */ class SparsePageFactory { public: @@ -217,7 +221,7 @@ class SparsePageFactory { return action_index_set_; } // set index set, will be used after next before first - inline void SetIndexSet(const std::vector &index_set, + inline void SetIndexSet(const std::vector &index_set, bool load_all) { set_load_all_ = load_all; if (!set_load_all_) { @@ -229,7 +233,7 @@ class SparsePageFactory { return true; } inline void SetParam(const char *name, const char *val) {} - inline bool LoadNext(SparsePage *val) { + inline bool LoadNext(SparsePage *val) { if (!action_load_all_) { if (action_index_set_.size() == 0) { return false; diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index b58f7b2bb..3fbc579de 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -1,11 +1,13 @@ -#ifndef XGBOOST_LEARNER_DMATRIX_H_ -#define XGBOOST_LEARNER_DMATRIX_H_ /*! + * Copyright 2014 by Contributors * \file dmatrix.h - * \brief meta data and template data structure + * \brief meta data and template data structure * used for regression/classification/ranking * \author Tianqi Chen */ +#ifndef XGBOOST_LEARNER_DMATRIX_H_ +#define XGBOOST_LEARNER_DMATRIX_H_ + #include #include #include "../data.h" @@ -16,8 +18,8 @@ namespace learner { * \brief meta information needed in training, including label, weight */ struct MetaInfo { - /*! - * \brief information needed by booster + /*! + * \brief information needed by booster * BoosterInfo does not implement save and load, * all serialization is done in MetaInfo */ @@ -31,7 +33,7 @@ struct MetaInfo { std::vector group_ptr; /*! \brief weights of each instance, optional */ std::vector weights; - /*! + /*! 
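Every SparsePage::Push overload above performs the same offset arithmetic: the incoming batch carries row pointers relative to its own start, so each one is rebased onto top = offset.back() before being appended. In isolation, assuming the offsets are held in a plain std::vector<size_t>:

#include <cstddef>
#include <vector>

// Rebase a batch's row pointers onto the end of the page:
// new_offset[i] = top + (ind_ptr[i + 1] - ind_ptr[0]).
void AppendOffsets(std::vector<size_t> *offset,
                   const size_t *ind_ptr, size_t nrow) {
  size_t top = offset->back();
  for (size_t i = 0; i < nrow; ++i) {
    offset->push_back(top + ind_ptr[i + 1] - ind_ptr[0]);
  }
}
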
* \brief initialized margins, * if specified, xgboost will start from this init margin * can be used to specify initial prediction to boost from @@ -66,7 +68,7 @@ struct MetaInfo { return 1.0f; } } - inline void SaveBinary(utils::IStream &fo) const { + inline void SaveBinary(utils::IStream &fo) const { // NOLINT(*) int version = kVersion; fo.Write(&version, sizeof(version)); fo.Write(&info.num_row, sizeof(info.num_row)); @@ -77,7 +79,7 @@ struct MetaInfo { fo.Write(info.root_index); fo.Write(base_margin); } - inline void LoadBinary(utils::IStream &fi) { + inline void LoadBinary(utils::IStream &fi) { // NOLINT(*) int version; utils::Check(fi.Read(&version, sizeof(version)) != 0, "MetaInfo: invalid format"); utils::Check(fi.Read(&info.num_row, sizeof(info.num_row)) != 0, "MetaInfo: invalid format"); @@ -114,7 +116,7 @@ struct MetaInfo { return labels; } inline const std::vector& GetFloatInfo(const char *field) const { - return ((MetaInfo*)this)->GetFloatInfo(field); + return ((MetaInfo*)this)->GetFloatInfo(field); // NOLINT(*) } inline std::vector &GetUIntInfo(const char *field) { using namespace std; @@ -124,7 +126,7 @@ struct MetaInfo { return info.root_index; } inline const std::vector &GetUIntInfo(const char *field) const { - return ((MetaInfo*)this)->GetUIntInfo(field); + return ((MetaInfo*)this)->GetUIntInfo(field); // NOLINT(*) } // try to load weight information from file, if exists inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) { @@ -149,14 +151,14 @@ struct MetaInfo { * \tparam FMatrix type of feature data source */ struct DMatrix { - /*! - * \brief magic number associated with this object + /*! + * \brief magic number associated with this object * used to check if it is specific instance */ const int magic; /*! \brief meta information about the dataset */ MetaInfo info; - /*! + /*! * \brief cache pointer to verify if the data structure is cached in some learner * used to verify if DMatrix is cached */ diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index 433b5a00b..2b69a43a8 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -1,10 +1,12 @@ +/*! + * Copyright 2014 by Contributors + * \file xgboost_evaluation-inl.hpp + * \brief evaluation metrics for regression and classification and rank + * \author Kailong Chen, Tianqi Chen + */ #ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_ #define XGBOOST_LEARNER_EVALUATION_INL_HPP_ -/*! -* \file xgboost_evaluation-inl.hpp -* \brief evaluation metrics for regression and classification and rank -* \author Kailong Chen, Tianqi Chen -*/ + #include #include #include @@ -18,8 +20,8 @@ namespace xgboost { namespace learner { -/*! - * \brief base class of elementwise evaluation +/*! + * \brief base class of elementwise evaluation * \tparam Derived the name of subclass */ template @@ -47,15 +49,15 @@ struct EvalEWiseBase : public IEvaluator { } return Derived::GetFinal(dat[0], dat[1]); } - /*! - * \brief to be implemented by subclass, - * get evaluation result from one row + /*! + * \brief to be implemented by subclass, + * get evaluation result from one row * \param label label of current instance * \param pred prediction value of current instance */ inline static float EvalRow(float label, float pred); - /*! - * \brief to be overide by subclas, final trasnformation + /*! 
+ * \brief to be overide by subclas, final trasnformation * \param esum the sum statistics returned by EvalRow * \param wsum sum of weight */ @@ -87,9 +89,9 @@ struct EvalLogLoss : public EvalEWiseBase { const float eps = 1e-16f; const float pneg = 1.0f - py; if (py < eps) { - return -y * std::log(eps) - (1.0f - y) * std::log(1.0f - eps); + return -y * std::log(eps) - (1.0f - y) * std::log(1.0f - eps); } else if (pneg < eps) { - return -y * std::log(1.0f - eps) - (1.0f - y) * std::log(eps); + return -y * std::log(1.0f - eps) - (1.0f - y) * std::log(eps); } else { return -y * std::log(py) - (1.0f - y) * std::log(pneg); } @@ -119,7 +121,7 @@ struct EvalPoissionNegLogLik : public EvalEWiseBase { } }; -/*! +/*! * \brief base class of multi-class evaluation * \tparam Derived the name of subclass */ @@ -139,7 +141,7 @@ struct EvalMClassBase : public IEvaluator { float sum = 0.0, wsum = 0.0; int label_error = 0; #pragma omp parallel for reduction(+: sum, wsum) schedule(static) - for (bst_omp_uint i = 0; i < ndata; ++i) { + for (bst_omp_uint i = 0; i < ndata; ++i) { const float wt = info.GetWeight(i); int label = static_cast(info.labels[i]); if (label >= 0 && label < static_cast(nclass)) { @@ -161,18 +163,18 @@ struct EvalMClassBase : public IEvaluator { } return Derived::GetFinal(dat[0], dat[1]); } - /*! - * \brief to be implemented by subclass, - * get evaluation result from one row + /*! + * \brief to be implemented by subclass, + * get evaluation result from one row * \param label label of current instance - * \param pred prediction value of current instance + * \param pred prediction value of current instance * \param nclass number of class in the prediction */ inline static float EvalRow(int label, const float *pred, size_t nclass); - /*! - * \brief to be overide by subclas, final trasnformation + /*! + * \brief to be overide by subclas, final trasnformation * \param esum the sum statistics returned by EvalRow * \param wsum sum of weight */ @@ -208,7 +210,7 @@ struct EvalMultiLogLoss : public EvalMClassBase { } else { return -std::log(eps); } - } + } }; /*! 
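EvalLogLoss::EvalRow above guards its logarithms with eps = 1e-16f so that a prediction of exactly 0 or 1 cannot yield an infinite loss; predictions inside the eps band are scored as if they sat on its edge. An equivalent clamp-first formulation of the same rule:

#include <cmath>

// Clamp the predicted probability away from {0, 1}, then apply the
// usual binary cross-entropy; confident mistakes stay large but finite.
inline float LogLossRow(float y, float py) {
  const float eps = 1e-16f;
  if (py < eps) py = eps;
  if (py > 1.0f - eps) py = 1.0f - eps;
  return -y * std::log(py) - (1.0f - y) * std::log(1.0f - py);
}
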
\brief ctest */ @@ -240,7 +242,7 @@ struct EvalCTest: public IEvaluator { tpred.push_back(preds[i + (k + 1) * ndata]); tinfo.labels.push_back(info.labels[i]); tinfo.weights.push_back(info.GetWeight(i)); - } + } } wsum += base_->Eval(tpred, tinfo); } @@ -328,7 +330,7 @@ struct EvalPrecisionRatio : public IEvaluator{ const MetaInfo &info, bool distributed) const { utils::Check(!distributed, "metric %s do not support distributed evaluation", Name()); - utils::Check(info.labels.size() != 0, "label set cannot be empty"); + utils::Check(info.labels.size() != 0, "label set cannot be empty"); utils::Assert(preds.size() % info.labels.size() == 0, "label size predict size not match"); std::vector< std::pair > rec; @@ -344,7 +346,8 @@ struct EvalPrecisionRatio : public IEvaluator{ } protected: - inline double CalcPRatio(const std::vector< std::pair >& rec, const MetaInfo &info) const { + inline double CalcPRatio(const std::vector< std::pair >& rec, + const MetaInfo &info) const { size_t cutoff = static_cast(ratio_ * rec.size()); double wt_hit = 0.0, wsum = 0.0, wt_sum = 0.0; for (size_t j = 0; j < cutoff; ++j) { @@ -372,7 +375,7 @@ struct EvalAuc : public IEvaluator { utils::Check(info.labels.size() != 0, "label set cannot be empty"); utils::Check(preds.size() % info.labels.size() == 0, "label size predict size not match"); - std::vector tgptr(2, 0); + std::vector tgptr(2, 0); tgptr[1] = static_cast(info.labels.size()); const std::vector &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr; @@ -417,8 +420,8 @@ struct EvalAuc : public IEvaluator { } if (distributed) { float dat[2]; - dat[0] = static_cast(sum_auc); - dat[1] = static_cast(ngroup); + dat[0] = static_cast(sum_auc); + dat[1] = static_cast(ngroup); // approximately estimate auc using mean rabit::Allreduce(dat, 2); return dat[0] / dat[1]; @@ -463,8 +466,8 @@ struct EvalRankList : public IEvaluator { } if (distributed) { float dat[2]; - dat[0] = static_cast(sum_metric); - dat[1] = static_cast(ngroup); + dat[0] = static_cast(sum_metric); + dat[1] = static_cast(ngroup); // approximately estimate auc using mean rabit::Allreduce(dat, 2); return dat[0] / dat[1]; @@ -489,7 +492,7 @@ struct EvalRankList : public IEvaluator { } } /*! \return evaluation metric, given the pair_sort record, (pred,label) */ - virtual float EvalMetric(std::vector< std::pair > &pair_sort) const = 0; + virtual float EvalMetric(std::vector< std::pair > &pair_sort) const = 0; // NOLINT(*) protected: unsigned topn_; @@ -524,13 +527,13 @@ struct EvalNDCG : public EvalRankList{ double sumdcg = 0.0; for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) { const unsigned rel = rec[i].second; - if (rel != 0) { + if (rel != 0) { sumdcg += ((1 << rel) - 1) / std::log(i + 2.0); } } return static_cast(sumdcg); } - virtual float EvalMetric(std::vector< std::pair > &rec) const { + virtual float EvalMetric(std::vector< std::pair > &rec) const { // NOLINT(*) std::stable_sort(rec.begin(), rec.end(), CmpFirst); float dcg = this->CalcDCG(rec); std::stable_sort(rec.begin(), rec.end(), CmpSecond); diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h index 85358e72e..a98c47495 100644 --- a/src/learner/evaluation.h +++ b/src/learner/evaluation.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_LEARNER_EVALUATION_H_ -#define XGBOOST_LEARNER_EVALUATION_H_ /*! 
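EvalAuc above computes the statistic per query group and, when distributed, Allreduces the pair {sum_auc, ngroup} so the reported value is the mean over every worker's groups. For one unweighted group the core quantity is the fraction of correctly ordered positive/negative pairs; a condensed sketch that ignores prediction ties (the real metric buckets them) and assumes both classes occur:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// AUC for a single group as a normalized count of concordant pairs:
// after sorting by prediction, each positive contributes the number
// of negatives ranked strictly below it.
inline double BinaryAUC(std::vector<std::pair<float, int> > rec) {  // (pred, label)
  std::stable_sort(rec.begin(), rec.end());  // ascending by prediction
  double neg_below = 0.0, hit = 0.0, npos = 0.0, nneg = 0.0;
  for (size_t i = 0; i < rec.size(); ++i) {
    if (rec[i].second == 0) {
      neg_below += 1.0; nneg += 1.0;
    } else {
      hit += neg_below; npos += 1.0;
    }
  }
  return hit / (npos * nneg);
}
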
+ * Copyright 2014 by Contributors * \file evaluation.h * \brief interface of evaluation function supported in xgboost * \author Tianqi Chen, Kailong Chen */ +#ifndef XGBOOST_LEARNER_EVALUATION_H_ +#define XGBOOST_LEARNER_EVALUATION_H_ + #include #include #include @@ -19,7 +21,7 @@ struct IEvaluator{ * \brief evaluate a specific metric * \param preds prediction * \param info information, including label etc. - * \param distributed whether a call to Allreduce is needed to gather + * \param distributed whether a call to Allreduce is needed to gather * the average statistics across all the node, * this is only supported by some metrics */ diff --git a/src/learner/helper_utils.h b/src/learner/helper_utils.h index d318cf8bd..7ca7ba59c 100644 --- a/src/learner/helper_utils.h +++ b/src/learner/helper_utils.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_ -#define XGBOOST_LEARNER_HELPER_UTILS_H_ /*! + * Copyright 2014 by Contributors * \file helper_utils.h * \brief useful helper functions * \author Tianqi Chen, Kailong Chen */ +#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_ +#define XGBOOST_LEARNER_HELPER_UTILS_H_ + #include #include #include @@ -61,7 +63,7 @@ inline float LogSum(const float *rec, size_t size) { for (size_t i = 0; i < size; ++i) { sum += std::exp(rec[i] - mx); } - return mx + std::log(sum); + return mx + std::log(sum); } inline static bool CmpFirst(const std::pair &a, diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index 45e312aa7..f051992d3 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -1,10 +1,12 @@ -#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_ -#define XGBOOST_LEARNER_LEARNER_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file learner-inl.hpp - * \brief learning algorithm + * \brief learning algorithm * \author Tianqi Chen */ +#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_ +#define XGBOOST_LEARNER_LEARNER_INL_HPP_ + #include #include #include @@ -19,7 +21,7 @@ namespace xgboost { /*! \brief namespace for learning algorithm */ namespace learner { -/*! +/*! * \brief learner that takes do gradient boosting on specific objective functions * and do training and prediction */ @@ -30,7 +32,7 @@ class BoostLearner : public rabit::Serializable { gbm_ = NULL; name_obj_ = "reg:linear"; name_gbm_ = "gbtree"; - silent= 0; + silent = 0; prob_buffer_row = 1.0f; distributed_mode = 0; updater_mode = 0; @@ -47,10 +49,10 @@ class BoostLearner : public rabit::Serializable { * \brief add internal cache space for mat, this can speedup prediction for matrix, * please cache prediction for training and eval data * warning: if the model is loaded from file from some previous training history - * set cache data must be called with exactly SAME + * set cache data must be called with exactly SAME * data matrices to continue training otherwise it will cause error * \param mats array of pointers to matrix whose prediction result need to be cached - */ + */ inline void SetCacheData(const std::vector& mats) { utils::Assert(cache_.size() == 0, "can only call cache data once"); // assign buffer index @@ -67,10 +69,10 @@ class BoostLearner : public rabit::Serializable { buffer_size += mats[i]->info.num_row(); } char str_temp[25]; - utils::SPrintf(str_temp, sizeof(str_temp), "%lu", - static_cast(buffer_size)); + utils::SPrintf(str_temp, sizeof(str_temp), "%lu", + static_cast(buffer_size)); // NOLINT(*) this->SetParam("num_pbuffer", str_temp); - this->pred_buffer_size = buffer_size; + this->pred_buffer_size = buffer_size; } /*! 
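LogSum in the helper_utils.h hunk above is the standard numerically stable log-sum-exp: the maximum is factored out so every exp argument is non-positive, nothing overflows, and the max is added back in log space. Written out in full:

#include <algorithm>
#include <cmath>
#include <cstddef>

// log(sum_i exp(rec[i])) computed as mx + log(sum_i exp(rec[i] - mx)),
// where mx = max_i rec[i]; every exp argument is <= 0, so no overflow.
inline float LogSumExp(const float *rec, size_t size) {
  float mx = rec[0];
  for (size_t i = 1; i < size; ++i) mx = std::max(mx, rec[i]);
  float sum = 0.0f;
  for (size_t i = 0; i < size; ++i) sum += std::exp(rec[i] - mx);
  return mx + std::log(sum);
}
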
* \brief set parameters from outside @@ -79,7 +81,7 @@ class BoostLearner : public rabit::Serializable { */ inline void SetParam(const char *name, const char *val) { using namespace std; - // in this version, bst: prefix is no longer required + // in this version, bst: prefix is no longer required if (strncmp(name, "bst:", 4) != 0) { std::string n = "bst:"; n += name; this->SetParam(n.c_str(), val); @@ -119,7 +121,7 @@ class BoostLearner : public rabit::Serializable { if (!strcmp(name, "objective")) name_obj_ = val; if (!strcmp(name, "booster")) name_gbm_ = val; mparam.SetParam(name, val); - } + } if (gbm_ != NULL) gbm_->SetParam(name, val); if (obj_ != NULL) obj_->SetParam(name, val); if (gbm_ == NULL || obj_ == NULL) { @@ -133,16 +135,16 @@ class BoostLearner : public rabit::Serializable { // estimate feature bound unsigned num_feature = 0; for (size_t i = 0; i < cache_.size(); ++i) { - num_feature = std::max(num_feature, + num_feature = std::max(num_feature, static_cast(cache_[i].mat_->info.num_col())); } // run allreduce on num_feature to find the maximum value rabit::Allreduce(&num_feature, 1); if (num_feature > mparam.num_feature) mparam.num_feature = num_feature; - } + } char str_temp[25]; utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature); - this->SetParam("bst:num_feature", str_temp); + this->SetParam("bst:num_feature", str_temp); } /*! * \brief initialize the model @@ -161,13 +163,13 @@ class BoostLearner : public rabit::Serializable { * \param fi input stream * \param calc_num_feature whether call InitTrainer with calc_num_feature */ - inline void LoadModel(utils::IStream &fi, + inline void LoadModel(utils::IStream &fi, // NOLINT(*) bool calc_num_feature = true) { utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, "BoostLearner: wrong model format"); { // backward compatibility code for compatible with old model type - // for new model, Read(&name_obj_) is suffice + // for new model, Read(&name_obj_) is suffice uint64_t len; utils::Check(fi.Read(&len, sizeof(len)) != 0, "BoostLearner: wrong model format"); if (len >= std::numeric_limits::max()) { @@ -226,9 +228,9 @@ class BoostLearner : public rabit::Serializable { fi = utils::IStream::Create(fname, "r"); this->LoadModel(*fi, true); } - delete fi; + delete fi; } - inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const { + inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*) ModelParam p = mparam; p.saved_with_pbuffer = static_cast(with_pbuffer); fo.Write(&p, sizeof(ModelParam)); @@ -247,7 +249,7 @@ class BoostLearner : public rabit::Serializable { fo->Write("bs64\t", 5); utils::Base64OutStream bout(fo); this->SaveModel(bout, with_pbuffer); - bout.Finish('\n'); + bout.Finish('\n'); } else { fo->Write("binf", 4); this->SaveModel(*fo, with_pbuffer); @@ -260,7 +262,7 @@ class BoostLearner : public rabit::Serializable { * \param p_train pointer to the matrix used by training */ inline void CheckInit(DMatrix *p_train) { - int ncol = static_cast(p_train->info.info.num_col); + int ncol = static_cast(p_train->info.info.num_col); std::vector enabled(ncol, true); // set max row per batch to limited value // in distributed mode, use safe choice otherwise @@ -345,10 +347,9 @@ class BoostLearner : public rabit::Serializable { bool output_margin, std::vector *out_preds, unsigned ntree_limit = 0, - bool pred_leaf = false - ) const { + bool pred_leaf = false) const { if (pred_leaf) { - gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit); + 
gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit); } else { this->PredictRaw(data, out_preds, ntree_limit); if (!output_margin) { @@ -361,7 +362,7 @@ class BoostLearner : public rabit::Serializable { * NOTE: use the batch prediction interface if possible, batch prediction is usually * more efficient than online prediction * This function is NOT threadsafe, make sure you only call from one thread - * + * * \param inst the instance you want to predict * \param output_margin whether to only predict margin value instead of transformed prediction * \param out_preds output vector to hold the predictions @@ -387,8 +388,8 @@ class BoostLearner : public rabit::Serializable { } protected: - /*! - * \brief initialize the objective function and GBM, + /*! + * \brief initialize the objective function and GBM, * if not yet done */ inline void InitObjGBM(void) { @@ -401,12 +402,12 @@ class BoostLearner : public rabit::Serializable { for (size_t i = 0; i < cfg_.size(); ++i) { obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str()); gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str()); - } + } if (evaluator_.Size() == 0) { evaluator_.AddEval(obj_->DefaultEvalMetric()); } } - /*! + /*! * \brief additional default value for specific objs */ inline void InitAdditionDefaultParam(void) { @@ -415,12 +416,12 @@ class BoostLearner : public rabit::Serializable { gbm_->SetParam("max_delta_step", "0.7"); } } - /*! + /*! * \brief get un-transformed prediction * \param data training data matrix * \param out_preds output vector that stores the prediction * \param ntree_limit limit number of trees used for boosted tree - * predictor, when it equals 0, this means we are using all the trees + * predictor, when it equals 0, this means we are using all the trees */ inline void PredictRaw(const DMatrix &data, std::vector *out_preds, @@ -517,7 +518,7 @@ class BoostLearner : public rabit::Serializable { protected: // magic number to transform random seed - const static int kRandSeedMagic = 127; + static const int kRandSeedMagic = 127; // cache entry object that helps handle feature caching struct CacheEntry { const DMatrix *mat_; diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index d0ecf7a27..b6d388e3c 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -1,10 +1,12 @@ -#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ -#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ /*! 
+ * Copyright 2014 by Contributors * \file objective-inl.hpp * \brief objective function implementations * \author Tianqi Chen, Kailong Chen */ +#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ +#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ + #include #include #include @@ -176,14 +178,14 @@ class RegLossObj : public IObjFunction { // poisson regression for count class PoissonRegression : public IObjFunction { public: - explicit PoissonRegression(void) { + PoissonRegression(void) { max_delta_step = 0.0f; } virtual ~PoissonRegression(void) {} - + virtual void SetParam(const char *name, const char *val) { using namespace std; - if (!strcmp( "max_delta_step", name )) { + if (!strcmp("max_delta_step", name)) { max_delta_step = static_cast(atof(val)); } } @@ -201,9 +203,9 @@ class PoissonRegression : public IObjFunction { // check if label in range bool label_correct = true; // start calculating gradient - const long ndata = static_cast(preds.size()); + const long ndata = static_cast(preds.size()); // NOLINT(*) #pragma omp parallel for schedule(static) - for (long i = 0; i < ndata; ++i) { + for (long i = 0; i < ndata; ++i) { // NOLINT(*) float p = preds[i]; float w = info.GetWeight(i); float y = info.labels[i]; @@ -219,9 +221,9 @@ class PoissonRegression : public IObjFunction { } virtual void PredTransform(std::vector *io_preds) { std::vector &preds = *io_preds; - const long ndata = static_cast(preds.size()); + const long ndata = static_cast(preds.size()); // NOLINT(*) #pragma omp parallel for schedule(static) - for (long j = 0; j < ndata; ++j) { + for (long j = 0; j < ndata; ++j) { // NOLINT(*) preds[j] = std::exp(preds[j]); } } @@ -234,7 +236,7 @@ class PoissonRegression : public IObjFunction { virtual const char* DefaultEvalMetric(void) const { return "poisson-nloglik"; } - + private: float max_delta_step; }; @@ -467,7 +469,7 @@ class LambdaRankObj : public IObjFunction { : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {} }; /*! - * \brief get lambda weight for existing pairs + * \brief get lambda weight for existing pairs * \param list a list that is sorted by pred score * \param io_pairs record of pairs, containing the pairs to fill in weights */ @@ -555,10 +557,10 @@ class LambdaRankObjMAP : public LambdaRankObj { float ap_acc; /*! * \brief the accumulated precision, - * assuming a positive instance is missing + * assuming a positive instance is missing */ float ap_acc_miss; - /*! + /*! * \brief the accumulated precision, * assuming that one more positive instance is inserted ahead */ diff --git a/src/learner/objective.h b/src/learner/objective.h index c0a525a43..08b57f528 100644 --- a/src/learner/objective.h +++ b/src/learner/objective.h @@ -1,11 +1,14 @@ -#ifndef XGBOOST_LEARNER_OBJECTIVE_H_ -#define XGBOOST_LEARNER_OBJECTIVE_H_ /*! + * Copyright 2014 by Contributors * \file objective.h * \brief interface of objective function used for gradient boosting * \author Tianqi Chen, Kailong Chen */ -#include "dmatrix.h" +#ifndef XGBOOST_LEARNER_OBJECTIVE_H_ +#define XGBOOST_LEARNER_OBJECTIVE_H_ + +#include +#include "./dmatrix.h" namespace xgboost { namespace learner { @@ -13,13 +16,13 @@ namespace learner { class IObjFunction{ public: /*! \brief virtual destructor */ - virtual ~IObjFunction(void){} + virtual ~IObjFunction(void) {} /*! * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ - virtual void SetParam(const char *name, const char *val) = 0; + virtual void SetParam(const char *name, const char *val) = 0; /*! 
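PoissonRegression above fits the Poisson negative log-likelihood under a log link, l(y, p) = exp(p) - y * p, whose first and second derivatives in p are exp(p) - y and exp(p); inflating the hessian by exp(max_delta_step) is what bounds the resulting Newton step. A per-row sketch of those formulas as I read the objective (the exact member code is elided from the hunk):

#include <cmath>

// Gradient and hessian of the Poisson NLL at margin p for count y,
// scaled by the instance weight w; max_delta_step inflates the hessian
// (hess = exp(p + max_delta_step)) to cap the resulting leaf update.
inline void PoissonGradHess(float y, float p, float w, float max_delta_step,
                            float *grad, float *hess) {
  *grad = (std::exp(p) - y) * w;
  *hess = std::exp(p + max_delta_step) * w;
}
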
* \brief get gradient over each of predictions, given existing information * \param preds prediction of current round @@ -38,9 +41,9 @@ class IObjFunction{ * \brief transform prediction values, this is only called when Prediction is called * \param io_preds prediction values, saves to this vector as well */ - virtual void PredTransform(std::vector *io_preds){} + virtual void PredTransform(std::vector *io_preds) {} /*! - * \brief transform prediction values, this is only called when Eval is called, + * \brief transform prediction values, this is only called when Eval is called, * usually it redirect to PredTransform * \param io_preds prediction values, saves to this vector as well */ @@ -49,7 +52,7 @@ class IObjFunction{ } /*! * \brief transform probability value back to margin - * this is used to transform user-set base_score back to margin + * this is used to transform user-set base_score back to margin * used by gradient boosting * \return transformed value */ @@ -77,7 +80,7 @@ inline IObjFunction* CreateObjFunction(const char *name) { if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1); if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj(); if (!strcmp("rank:ndcg", name)) return new LambdaRankObjNDCG(); - if (!strcmp("rank:map", name)) return new LambdaRankObjMAP(); + if (!strcmp("rank:map", name)) return new LambdaRankObjMAP(); utils::Error("unknown objective function type: %s", name); return NULL; } diff --git a/src/sync/sync.h b/src/sync/sync.h index 3a371b03c..b9bdf89fe 100644 --- a/src/sync/sync.h +++ b/src/sync/sync.h @@ -1,13 +1,13 @@ -#ifndef XGBOOST_SYNC_H_ -#define XGBOOST_SYNC_H_ /*! + * Copyright 2014 by Contributors * \file sync.h * \brief the synchronization module of rabit * redirects to subtree rabit header * \author Tianqi Chen */ +#ifndef XGBOOST_SYNC_SYNC_H_ +#define XGBOOST_SYNC_SYNC_H_ + #include "../../subtree/rabit/include/rabit.h" #include "../../subtree/rabit/include/rabit/timer.h" -#endif // XGBOOST_SYNC_H_ - - +#endif // XGBOOST_SYNC_SYNC_H_ diff --git a/src/tree/model.h b/src/tree/model.h index 4eea34911..6a22aa5f1 100644 --- a/src/tree/model.h +++ b/src/tree/model.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_TREE_MODEL_H_ -#define XGBOOST_TREE_MODEL_H_ /*! + * Copyright 2014 by Contributors * \file model.h * \brief model structure for tree * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_MODEL_H_ +#define XGBOOST_TREE_MODEL_H_ + #include #include #include @@ -19,7 +21,7 @@ namespace xgboost { namespace tree { /*! - * \brief template class of TreeModel + * \brief template class of TreeModel * \tparam TSplitCond data type to indicate split condition * \tparam TNodeStat auxiliary statistics of node to help tree building */ @@ -42,7 +44,7 @@ class TreeModel { int max_depth; /*! \brief number of features used for tree construction */ int num_feature; - /*! + /*! * \brief leaf vector size, used for vector tree * used to store more than one dimensional information in tree */ @@ -55,8 +57,8 @@ class TreeModel { size_leaf_vector = 0; std::memset(reserved, 0, sizeof(reserved)); } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ @@ -70,7 +72,7 @@ class TreeModel { /*! \brief tree node */ class Node { public: - Node(void) : sindex_(0) {} + Node(void) : sindex_(0) {} /*! 
\brief index of left child */ inline int cleft(void) const { return this->cleft_; @@ -119,15 +121,15 @@ class TreeModel { inline bool is_root(void) const { return parent_ == -1; } - /*! - * \brief set the right child + /*! + * \brief set the right child * \param nide node id to right child */ inline void set_right_child(int nid) { this->cright_ = nid; } - /*! - * \brief set split condition of current node + /*! + * \brief set split condition of current node * \param split_index feature index to split * \param split_cond split condition * \param default_left the default direction when feature is unknown @@ -138,10 +140,10 @@ class TreeModel { this->sindex_ = split_index; (this->info_).split_cond = split_cond; } - /*! + /*! * \brief set the leaf value of the node * \param value leaf value - * \param right right index, could be used to store + * \param right right index, could be used to store * additional information */ inline void set_leaf(float value, int right = -1) { @@ -153,12 +155,12 @@ class TreeModel { inline void mark_delete(void) { this->sindex_ = std::numeric_limits::max(); } - + private: friend class TreeModel; - /*! - * \brief in leaf node, we have weights, in non-leaf nodes, - * we have split condition + /*! + * \brief in leaf node, we have weights, in non-leaf nodes, + * we have split condition */ union Info{ float leaf_value; @@ -203,7 +205,7 @@ class TreeModel { "number of nodes in the tree exceed 2^31"); nodes.resize(param.num_nodes); stats.resize(param.num_nodes); - leaf_vector.resize(param.num_nodes * param.size_leaf_vector); + leaf_vector.resize(param.num_nodes * param.size_leaf_vector); return nd; } // delete a tree node, keep the parent field to allow trace back @@ -215,7 +217,7 @@ class TreeModel { } public: - /*! + /*! * \brief change a non leaf node to a leaf node, delete its children * \param rid node id of the node * \param new leaf value @@ -229,7 +231,7 @@ class TreeModel { this->DeleteNode(nodes[rid].cright()); nodes[rid].set_leaf(value); } - /*! + /*! * \brief collapse a non leaf node to a leaf node, delete its children * \param rid node id of the node * \param new leaf value @@ -273,7 +275,7 @@ class TreeModel { return &leaf_vector[nid * param.size_leaf_vector]; } /*! \brief get leaf vector given nid */ - inline const bst_float* leafvec(int nid) const{ + inline const bst_float* leafvec(int nid) const { if (leaf_vector.size() == 0) return NULL; return &leaf_vector[nid * param.size_leaf_vector]; } @@ -288,15 +290,15 @@ class TreeModel { nodes[i].set_parent(-1); } } - /*! + /*! * \brief load model from stream * \param fi input stream */ - inline void LoadModel(utils::IStream &fi) { + inline void LoadModel(utils::IStream &fi) { // NOLINT(*) utils::Check(fi.Read(¶m, sizeof(Param)) > 0, "TreeModel: wrong format"); nodes.resize(param.num_nodes); stats.resize(param.num_nodes); - utils::Assert(param.num_nodes != 0, "invalid model"); + utils::Assert(param.num_nodes != 0, "invalid model"); utils::Check(fi.Read(BeginPtr(nodes), sizeof(Node) * nodes.size()) > 0, "TreeModel: wrong format"); utils::Check(fi.Read(BeginPtr(stats), sizeof(NodeStat) * stats.size()) > 0, @@ -313,22 +315,22 @@ class TreeModel { "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d", param.num_deleted, deleted_nodes.size(), param.num_nodes); } - /*! + /*! 
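The Node accessors above depend on two packing tricks: a leaf value and a split condition share one union slot, and the default direction for missing values lives in the top bit of the 32-bit split feature index, as the 1U << 31 masks suggest. A reduced sketch of that layout:

// One float slot doubles as leaf value or split condition, and bit 31
// of sindex_ stores the default (missing-value) direction.
class NodeSketch {
 public:
  void set_split(unsigned split_index, float cond, bool default_left) {
    if (default_left) split_index |= (1U << 31);
    sindex_ = split_index;
    info_.split_cond = cond;
  }
  unsigned split_index() const { return sindex_ & ((1U << 31) - 1); }
  bool default_left() const { return (sindex_ >> 31) != 0; }

 private:
  union Info { float leaf_value; float split_cond; } info_;
  unsigned sindex_;
};
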
* \brief save model to stream * \param fo output stream */ - inline void SaveModel(utils::IStream &fo) const { + inline void SaveModel(utils::IStream &fo) const { // NOLINT(*) utils::Assert(param.num_nodes == static_cast(nodes.size()), "Tree::SaveModel"); utils::Assert(param.num_nodes == static_cast(stats.size()), "Tree::SaveModel"); fo.Write(¶m, sizeof(Param)); - utils::Assert(param.num_nodes != 0, "invalid model"); + utils::Assert(param.num_nodes != 0, "invalid model"); fo.Write(BeginPtr(nodes), sizeof(Node) * nodes.size()); fo.Write(BeginPtr(stats), sizeof(NodeStat) * nodes.size()); if (param.size_leaf_vector != 0) fo.Write(leaf_vector); } - /*! + /*! * \brief add child nodes to node * \param nid node id to add childs */ @@ -340,8 +342,8 @@ class TreeModel { nodes[nodes[nid].cleft() ].set_parent(nid, true); nodes[nodes[nid].cright()].set_parent(nid, false); } - /*! - * \brief only add a right child to a leaf node + /*! + * \brief only add a right child to a leaf node * \param node id to add right child */ inline void AddRightChild(int nid) { @@ -385,7 +387,7 @@ class TreeModel { inline int num_extra_nodes(void) const { return param.num_nodes - param.num_roots - param.num_deleted; } - /*! + /*! * \brief dump model to text string * \param fmap feature map of feature types * \param with_stats whether dump out statistics as well @@ -400,7 +402,7 @@ class TreeModel { } private: - void Dump(int nid, std::stringstream &fo, + void Dump(int nid, std::stringstream &fo, // NOLINT(*) const utils::FeatMap& fmap, int depth, bool with_stats) { for (int i = 0; i < depth; ++i) { fo << '\t'; @@ -469,7 +471,7 @@ struct RTreeNodeStat { /*! \brief number of child that is leaf node known up to now */ int leaf_child_cnt; /*! \brief print information of current stats to fo */ - inline void Print(std::stringstream &fo, bool is_leaf) const { + inline void Print(std::stringstream &fo, bool is_leaf) const { // NOLINT(*) if (!is_leaf) { fo << ",gain=" << loss_chg << ",cover=" << sum_hess; } else { @@ -481,13 +483,13 @@ struct RTreeNodeStat { /*! \brief define regression tree to be the most common tree model */ class RegTree: public TreeModel{ public: - /*! + /*! * \brief dense feature vector that can be taken by RegTree * to do tranverse efficiently * and can be construct from sparse feature vector */ struct FVec { - /*! + /*! * \brief a union value of value and flag * when flag == -1, this indicate the value is missing */ @@ -510,7 +512,7 @@ class RegTree: public TreeModel{ } } /*! \brief drop the trace after fill, must be called after fill */ - inline void Drop(const RowBatch::Inst &inst) { + inline void Drop(const RowBatch::Inst &inst) { for (bst_uint i = 0; i < inst.length; ++i) { if (inst[i].index >= data.size()) continue; data[inst[i].index].flag = -1; @@ -526,10 +528,10 @@ class RegTree: public TreeModel{ } }; /*! 
- * \brief get the leaf index + * \brief get the leaf index * \param feats dense feature vector, if the feature is missing the field is set to NaN * \param root_gid starting root index of the instance - * \return the leaf index of the given feature + * \return the leaf index of the given feature */ inline int GetLeafIndex(const FVec&feat, unsigned root_id = 0) const { // start from groups that belongs to current data @@ -545,7 +547,7 @@ class RegTree: public TreeModel{ * \brief get the prediction of regression tree, only accepts dense feature vector * \param feats dense feature vector, if the feature is missing the field is set to NaN * \param root_gid starting root index of the instance - * \return the leaf index of the given feature + * \return the leaf index of the given feature */ inline float Predict(const FVec &feat, unsigned root_id = 0) const { int pid = this->GetLeafIndex(feat, root_id); diff --git a/src/tree/param.h b/src/tree/param.h index 20ba1e6c0..f06365a17 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -1,10 +1,13 @@ -#ifndef XGBOOST_TREE_PARAM_H_ -#define XGBOOST_TREE_PARAM_H_ /*! + * Copyright 2014 by Contributors * \file param.h * \brief training parameters, statistics used to support tree construction * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_PARAM_H_ +#define XGBOOST_TREE_PARAM_H_ + +#include #include #include "../data.h" @@ -27,7 +30,7 @@ struct TrainParam{ // L1 regularization factor float reg_alpha; // default direction choice - int default_direction; + int default_direction; // maximum delta update we can add in weight estimation // this parameter can be used to stablize update // default=0 means no constraint on weight delta @@ -45,7 +48,7 @@ struct TrainParam{ // accuracy of sketch float sketch_ratio; // leaf vector size - int size_leaf_vector; + int size_leaf_vector; // option for parallelization int parallel_option; // option to open cacheline optimizaton @@ -74,11 +77,11 @@ struct TrainParam{ sketch_ratio = 2.0f; cache_opt = 1; } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter - */ + */ inline void SetParam(const char *name, const char *val) { using namespace std; // sync-names @@ -116,7 +119,7 @@ struct TrainParam{ if (reg_alpha == 0.0f) { return Sqr(sum_grad) / (sum_hess + reg_lambda); } else { - return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda); + return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda); } } else { double w = CalcWeight(sum_grad, sum_hess); @@ -213,7 +216,7 @@ struct GradStats { inline static void CheckInfo(const BoosterInfo &info) { } /*! - * \brief accumulate statistics + * \brief accumulate statistics * \param p the gradient pair */ inline void Add(bst_gpair p) { @@ -222,7 +225,7 @@ struct GradStats { /*! * \brief accumulate statistics, more complicated version * \param gpair the vector storing the gradient statistics - * \param info the additional information + * \param info the additional information * \param ridx instance index of this instance */ inline void Add(const std::vector &gpair, @@ -244,7 +247,7 @@ struct GradStats { this->Add(b.sum_grad, b.sum_hess); } /*! \brief same as add, reduce is used in All Reduce */ - inline static void Reduce(GradStats &a, const GradStats &b) { + inline static void Reduce(GradStats &a, const GradStats &b) { // NOLINT(*) a.Add(b); } /*! \brief set current value to a - b */ @@ -257,8 +260,8 @@ struct GradStats { return sum_hess == 0.0; } /*! 
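RegTree::GetLeafIndex above walks from a root to a leaf using the dense FVec: a feature marked missing follows the node's stored default direction, otherwise fvalue < split_cond picks the left child. A self-contained sketch of that loop, with Node flattened to plain fields and NaN standing for the FVec missing flag:

// Simplified node: children indices (-1 marks a leaf), the feature
// checked at this node, its threshold, and the missing-value direction.
struct FlatNode {
  int cleft, cright;
  unsigned split_index;
  float split_cond;
  bool default_left;
};

inline int GetLeafSketch(const FlatNode *nodes, const float *feat, int pid) {
  while (nodes[pid].cleft != -1) {
    float fvalue = feat[nodes[pid].split_index];
    if (fvalue != fvalue) {  // NaN compares unequal to itself: missing
      pid = nodes[pid].default_left ? nodes[pid].cleft : nodes[pid].cright;
    } else {
      pid = fvalue < nodes[pid].split_cond ? nodes[pid].cleft : nodes[pid].cright;
    }
  }
  return pid;
}
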
\brief set leaf vector value based on statistics */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
-  }
+  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
+  }
   // constructor to allow inheritance
   GradStats(void) {}
   /*! \brief add statistics to the data */
@@ -311,7 +314,7 @@ struct CVGradStats : public GradStats {
       ret += param.CalcGain(train[i].sum_grad, train[i].sum_hess,
                             vsize * valid[i].sum_grad,
-                            vsize * valid[i].sum_hess);
+                            vsize * valid[i].sum_hess);
     }
     return ret / vsize;
   }
@@ -324,7 +327,7 @@ struct CVGradStats : public GradStats {
     }
   }
   /*! \brief same as add, reduce is used in All Reduce */
-  inline static void Reduce(CVGradStats &a, const CVGradStats &b) {
+  inline static void Reduce(CVGradStats &a, const CVGradStats &b) { // NOLINT(*)
     a.Add(b);
   }
   /*! \brief set current value to a - b */
@@ -344,8 +347,8 @@
   }
 };
-/*!
- * \brief statistics that is helpful to store
+/*!
+ * \brief statistics that is helpful to store
  * and represent a split solution for the tree
  */
 struct SplitEntry{
@@ -357,12 +360,12 @@ struct SplitEntry{
   float split_value;
   /*! \brief constructor */
   SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
-  /*!
-   * \brief decides whether a we can replace current entry with the statistics given
+  /*!
+   * \brief decides whether a we can replace current entry with the statistics given
    * This function gives better priority to lower index when loss_chg equals
    *  not the best way, but helps to give consistent result during multi-thread execution
    * \param loss_chg the loss reduction get through the split
-   * \param split_index the feature index where the split is on
+   * \param split_index the feature index where the split is on
    */
   inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
     if (this->split_index() <= split_index) {
@@ -371,7 +374,7 @@
       return !(this->loss_chg > new_loss_chg);
     }
   }
-  /*!
+  /*!
    * \brief update the split entry, replace it if e is better
    * \param e candidate split solution
    * \return whether the proposed split is better and can replace current split
@@ -386,7 +389,7 @@
       return false;
     }
   }
-  /*!
+  /*!
    * \brief update the split entry, replace it if e is better
    * \param loss_chg loss reduction of new candidate
    * \param split_index feature index to split on
@@ -407,7 +410,7 @@
     }
   }
   /*! \brief same as update, used by AllReduce*/
-  inline static void Reduce(SplitEntry &dst, const SplitEntry &src) {
+  inline static void Reduce(SplitEntry &dst, const SplitEntry &src) { // NOLINT(*)
     dst.Update(src);
   }
   /*!\return feature index to split on */
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 5d2e99820..eb2e06925 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -1,3 +1,4 @@
+// Copyright 2014 by Contributors
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
 #define NOMINMAX
diff --git a/src/tree/updater.h b/src/tree/updater.h
index 4ced21e5e..1cf74a699 100644
--- a/src/tree/updater.h
+++ b/src/tree/updater.h
@@ -1,10 +1,12 @@
-#ifndef XGBOOST_TREE_UPDATER_H_
-#define XGBOOST_TREE_UPDATER_H_
 /*!
+ * Copyright 2014 by Contributors
  * \file updater.h
  * \brief interface to update the tree
  * \author Tianqi Chen
  */
+#ifndef XGBOOST_TREE_UPDATER_H_
+#define XGBOOST_TREE_UPDATER_H_
+
 #include <vector>
 #include "../data.h"
@@ -12,7 +14,7 @@ namespace xgboost {
 namespace tree {
-/*!
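The CalcGain branch in the param.h hunks scores a split child as Sqr(ThresholdL1(G, reg_alpha)) / (H + reg_lambda) whenever L1 regularization is active: the gradient sum is soft-thresholded by alpha before the usual squared-gradient-over-hessian score. Reassembled as standalone functions:

// Soft-threshold the gradient sum by the L1 weight alpha, then apply
// the regularized score G^2 / (H + lambda) used to rank splits.
inline double ThresholdL1Demo(double g, double alpha) {
  if (g > alpha) return g - alpha;
  if (g < -alpha) return g + alpha;
  return 0.0;
}

inline double CalcGainDemo(double sum_grad, double sum_hess,
                           double reg_alpha, double reg_lambda) {
  double g = ThresholdL1Demo(sum_grad, reg_alpha);
  return g * g / (sum_hess + reg_lambda);
}
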
* \brief interface of tree update module, that performs update of a tree */ class IUpdater { @@ -21,7 +23,7 @@ class IUpdater { * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter - */ + */ virtual void SetParam(const char *name, const char *val) = 0; /*! * \brief peform update to the tree models @@ -29,8 +31,8 @@ class IUpdater { * \param p_fmat feature matrix that provide access to features * \param info extra side information that may be need, such as root index * \param trees pointer to the trees to be updated, upater will change the content of the tree - * note: all the trees in the vector are updated, with the same statistics, - * but maybe different random seeds, usually one tree is passed in at a time, + * note: all the trees in the vector are updated, with the same statistics, + * but maybe different random seeds, usually one tree is passed in at a time, * there can be multiple trees when we train random forest style model */ virtual void Update(const std::vector &gpair, @@ -38,7 +40,7 @@ class IUpdater { const BoosterInfo &info, const std::vector &trees) = 0; - /*! + /*! * \brief this is simply a function for optimizing performance * this function asks the updater to return the leaf position of each instance in the p_fmat, * if it is cached in the updater, if it is not available, return NULL @@ -50,8 +52,8 @@ class IUpdater { // destructor virtual ~IUpdater(void) {} }; -/*! - * \brief create a updater based on name +/*! + * \brief create a updater based on name * \param name name of updater * \return return the updater instance */ diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp index f144ae199..6204c47b7 100644 --- a/src/tree/updater_basemaker-inl.hpp +++ b/src/tree/updater_basemaker-inl.hpp @@ -1,12 +1,14 @@ -#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_ -#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file updater_basemaker-inl.hpp * \brief implement a common tree constructor * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_ +#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_ #include #include +#include #include #include "../sync/sync.h" #include "../utils/random.h" @@ -14,7 +16,7 @@ namespace xgboost { namespace tree { -/*! +/*! 
* \brief base tree maker class that defines common operation * needed in tree making */ @@ -26,7 +28,7 @@ class BaseMaker: public IUpdater { virtual void SetParam(const char *name, const char *val) { param.SetParam(name, val); } - + protected: // helper to collect and query feature meta information struct FMetaHelper { @@ -60,8 +62,11 @@ class BaseMaker: public IUpdater { bst_float a = fminmax[fid * 2]; bst_float b = fminmax[fid * 2 + 1]; if (a == -std::numeric_limits::max()) return 0; - if (-a == b) return 1; - else return 2; + if (-a == b) { + return 1; + } else { + return 2; + } } inline bst_float MaxValue(bst_uint fid) const { return fminmax[fid *2 + 1]; @@ -70,7 +75,7 @@ class BaseMaker: public IUpdater { std::vector &findex = *p_findex; findex.clear(); for (size_t i = 0; i < fminmax.size(); i += 2) { - const bst_uint fid = static_cast(i / 2); + const bst_uint fid = static_cast(i / 2); if (this->Type(fid) != 0) findex.push_back(fid); } unsigned n = static_cast(p * findex.size()); @@ -86,7 +91,7 @@ class BaseMaker: public IUpdater { rabit::Broadcast(&s_cache, 0); fs.Read(&findex); } - + private: std::vector fminmax; }; @@ -116,7 +121,7 @@ class BaseMaker: public IUpdater { } return nthread; } - // ------class member helpers--------- + // ------class member helpers--------- /*! \brief initialize temp data structure */ inline void InitData(const std::vector &gpair, const IFMatrix &fmat, @@ -124,7 +129,8 @@ class BaseMaker: public IUpdater { const RegTree &tree) { utils::Assert(tree.param.num_nodes == tree.param.num_roots, "TreeMaker: can only grow new tree"); - {// setup position + { + // setup position position.resize(gpair.size()); if (root_index.size() == 0) { std::fill(position.begin(), position.end(), 0); @@ -147,7 +153,8 @@ class BaseMaker: public IUpdater { } } } - {// expand query + { + // expand query qexpand.reserve(256); qexpand.clear(); for (int i = 0; i < tree.param.num_roots; ++i) { qexpand.push_back(i); @@ -170,7 +177,7 @@ class BaseMaker: public IUpdater { this->UpdateNode2WorkIndex(tree); } // return decoded position - inline int DecodePosition(bst_uint ridx) const{ + inline int DecodePosition(bst_uint ridx) const { const int pid = position[ridx]; return pid < 0 ? ~pid : pid; } @@ -182,23 +189,24 @@ class BaseMaker: public IUpdater { position[ridx] = nid; } } - /*! + /*! 
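The position encoding behind DecodePosition/SetEncodePosition above is compact enough to demonstrate in a few lines: a non-negative entry is a live node id, while the bitwise complement ~nid (always negative) marks an instance whose leaf is no longer fresh and should be skipped when collecting statistics. A minimal sketch:

#include <cassert>

// DecodePosition recovers the node id whether or not the entry is parked.
inline int Decode(int pos) { return pos < 0 ? ~pos : pos; }

int main() {
  int live = 5;        // instance sits in node 5 and still contributes
  int parked = ~5;     // same node, but ignored by future statistics passes
  assert(Decode(live) == 5 && Decode(parked) == 5);
  assert(parked < 0);  // the sign bit doubles as the "ignore me" flag
  return 0;
}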
* \brief this is helper function uses column based data structure, * reset the positions to the lastest one * \param nodes the set of nodes that contains the split to be used * \param p_fmat feature matrix needed for tree construction * \param tree the regression tree structure */ - inline void ResetPositionCol(const std::vector &nodes, IFMatrix *p_fmat, const RegTree &tree) { + inline void ResetPositionCol(const std::vector &nodes, + IFMatrix *p_fmat, const RegTree &tree) { // set the positions in the nondefault this->SetNonDefaultPositionCol(nodes, p_fmat, tree); // set rest of instances to default position const std::vector &rowset = p_fmat->buffered_rowset(); // set default direct nodes to default - // for leaf nodes that are not fresh, mark then to ~nid, + // for leaf nodes that are not fresh, mark then to ~nid, // so that they are ignored in future statistics collection const bst_omp_uint ndata = static_cast(rowset.size()); - + #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < ndata; ++i) { const bst_uint ridx = rowset[i]; @@ -237,7 +245,7 @@ class BaseMaker: public IUpdater { } std::sort(fsplits.begin(), fsplits.end()); fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin()); - + utils::IIterator *iter = p_fmat->ColIterator(fsplits); while (iter->Next()) { const ColBatch &batch = iter->Value(); @@ -252,7 +260,7 @@ class BaseMaker: public IUpdater { const int nid = this->DecodePosition(ridx); // go back to parent, correct those who are not default if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { - if(fvalue < tree[nid].split_cond()) { + if (fvalue < tree[nid].split_cond()) { this->SetEncodePosition(ridx, tree[nid].cleft()); } else { this->SetEncodePosition(ridx, tree[nid].cright()); @@ -324,7 +332,7 @@ class BaseMaker: public IUpdater { sketch->temp.size = 0; } /*! 
- * \brief push a new element to sketch + * \brief push a new element to sketch * \param fvalue feature value, comes in sorted ascending order * \param w weight * \param max_size @@ -337,31 +345,32 @@ class BaseMaker: public IUpdater { return; } if (last_fvalue != fvalue) { - double rmax = rmin + wmin; + double rmax = rmin + wmin; if (rmax >= next_goal && sketch->temp.size != max_size) { - if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { + if (sketch->temp.size == 0 || + last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { // push to sketch sketch->temp.data[sketch->temp.size] = utils::WXQuantileSketch:: Entry(static_cast(rmin), - static_cast(rmax), - static_cast(wmin), last_fvalue); + static_cast(rmax), + static_cast(wmin), last_fvalue); utils::Assert(sketch->temp.size < max_size, "invalid maximum size max_size=%u, stemp.size=%lu\n", max_size, sketch->temp.size); ++sketch->temp.size; } if (sketch->temp.size == max_size) { - next_goal = sum_total * 2.0f + 1e-5f; - } else{ + next_goal = sum_total * 2.0f + 1e-5f; + } else { next_goal = static_cast(sketch->temp.size * sum_total / max_size); } } else { - if (rmax >= next_goal) { - rabit::TrackerPrintf("INFO: rmax=%g, sum_total=%g, next_goal=%g, size=%lu\n", - rmax, sum_total, next_goal, sketch->temp.size); - } - } + if (rmax >= next_goal) { + rabit::TrackerPrintf("INFO: rmax=%g, sum_total=%g, next_goal=%g, size=%lu\n", + rmax, sum_total, next_goal, sketch->temp.size); + } + } rmin = rmax; wmin = w; last_fvalue = fvalue; @@ -375,13 +384,13 @@ class BaseMaker: public IUpdater { if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { utils::Assert(sketch->temp.size <= max_size, "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu", - sketch->temp.size, max_size ); + sketch->temp.size, max_size); // push to sketch sketch->temp.data[sketch->temp.size] = utils::WXQuantileSketch:: Entry(static_cast(rmin), - static_cast(rmax), - static_cast(wmin), last_fvalue); + static_cast(rmax), + static_cast(wmin), last_fvalue); ++sketch->temp.size; } sketch->PushTemp(); @@ -415,4 +424,4 @@ class BaseMaker: public IUpdater { }; } // namespace tree } // namespace xgboost -#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_ +#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_ diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index db3581aac..e3070d495 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -1,10 +1,12 @@ -#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_ -#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_ /*! 
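The Push logic above is easier to see with the sketch bookkeeping stripped away: values stream in ascending order, and one is kept only when the running rank crosses the next evenly spaced goal, so at most max_size points summarize the weighted CDF. A toy version assuming unit weights and a known sum_total, as in the real code; the error-bound machinery of the real WXQuantileSketch is omitted:

#include <cstdio>
#include <vector>

int main() {
  const unsigned max_size = 4;
  double sum_total = 10.0;        // total weight, known up front
  double rmin = 0.0, next_goal = 0.0;
  std::vector<double> kept;
  double values[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};  // weight 1 each
  for (double v : values) {
    double rmax = rmin + 1.0;     // rank range after absorbing this value
    if (rmax >= next_goal && kept.size() < max_size) {
      kept.push_back(v);          // keep it and chase the next rank goal
      next_goal = kept.size() * sum_total / max_size;
    }
    rmin = rmax;
  }
  for (size_t i = 0; i < kept.size(); ++i)
    std::printf("%g ", kept[i]);  // prints 1 3 5 8: a roughly even subset
  return 0;
}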
+ * Copyright 2014 by Contributors * \file updater_colmaker-inl.hpp * \brief use columnwise update to construct a tree * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_ +#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_ + #include #include #include @@ -114,10 +116,13 @@ class ColMaker: public IUpdater { // initialize temp data structure inline void InitData(const std::vector &gpair, const IFMatrix &fmat, - const std::vector &root_index, const RegTree &tree) { - utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree"); + const std::vector &root_index, + const RegTree &tree) { + utils::Assert(tree.param.num_nodes == tree.param.num_roots, + "ColMaker: can only grow new tree"); const std::vector &rowset = fmat.buffered_rowset(); - {// setup position + { + // setup position position.resize(gpair.size()); if (root_index.size() == 0) { for (size_t i = 0; i < rowset.size(); ++i) { @@ -127,7 +132,8 @@ class ColMaker: public IUpdater { for (size_t i = 0; i < rowset.size(); ++i) { const bst_uint ridx = rowset[i]; position[ridx] = root_index[ridx]; - utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots, "root index exceed setting"); + utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots, + "root index exceed setting"); } } // mark delete for the deleted datas @@ -154,11 +160,12 @@ class ColMaker: public IUpdater { } unsigned n = static_cast(param.colsample_bytree * feat_index.size()); random::Shuffle(feat_index); - //utils::Check(n > 0, "colsample_bytree is too small that no feature can be included"); - utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included", param.colsample_bytree); + utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included", + param.colsample_bytree); feat_index.resize(n); } - {// setup temp space for each thread + { + // setup temp space for each thread #pragma omp parallel { this->nthread = omp_get_num_threads(); @@ -171,20 +178,25 @@ class ColMaker: public IUpdater { } snode.reserve(256); } - {// expand query + { + // expand query qexpand_.reserve(256); qexpand_.clear(); for (int i = 0; i < tree.param.num_roots; ++i) { qexpand_.push_back(i); } } } - /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */ + /*! 
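The colsample_bytree handling in InitData above amounts to shuffle-and-truncate over the usable feature indices, with the Check guarding against a sample so small that no feature survives. A sketch, with std::shuffle standing in for the code base's random::Shuffle:

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  float colsample_bytree = 0.5f;
  std::vector<unsigned> feat_index;
  for (unsigned f = 0; f < 8; ++f) feat_index.push_back(f);
  std::mt19937 rng(42);
  std::shuffle(feat_index.begin(), feat_index.end(), rng);
  unsigned n = static_cast<unsigned>(colsample_bytree * feat_index.size());
  // the real code errors out with the colsample value when n == 0
  feat_index.resize(std::max(n, 1u));
  for (size_t i = 0; i < feat_index.size(); ++i)
    std::printf("%u ", feat_index[i]);   // the surviving feature subset
  return 0;
}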
+ * \brief initialize the base_weight, root_gain, + * and NodeEntry for all the new nodes in qexpand + */ inline void InitNewNode(const std::vector &qexpand, const std::vector &gpair, const IFMatrix &fmat, const BoosterInfo &info, const RegTree &tree) { - {// setup statistics space for each tree node + { + // setup statistics space for each tree node for (size_t i = 0; i < stemp.size(); ++i) { stemp[i].resize(tree.param.num_nodes, ThreadEntry(param)); } @@ -226,7 +238,7 @@ class ColMaker: public IUpdater { } // use new nodes for qexpand qexpand = newnodes; - } + } // parallel find the best split of current fid // this function does not support nested functions inline void ParallelFindSplit(const ColBatch::Inst &col, @@ -280,26 +292,30 @@ class ColMaker: public IUpdater { ThreadEntry &e = stemp[tid][nid]; float fsplit; if (tid != 0) { - if(std::abs(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) { + if (std::abs(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) { fsplit = (stemp[tid - 1][nid].last_fvalue - e.first_fvalue) * 0.5f; } else { continue; } } else { fsplit = e.first_fvalue - rt_eps; - } + } if (need_forward && tid != 0) { c.SetSubstract(snode[nid].stats, e.stats); - if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + if (c.sum_hess >= param.min_child_weight && + e.stats.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); e.best.Update(loss_chg, fid, fsplit, false); } } if (need_backward) { tmp.SetSubstract(sum, e.stats); c.SetSubstract(snode[nid].stats, tmp); - if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + if (c.sum_hess >= param.min_child_weight && + tmp.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(tmp.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); e.best.Update(loss_chg, fid, fsplit, true); } } @@ -308,8 +324,10 @@ class ColMaker: public IUpdater { tmp = sum; ThreadEntry &e = stemp[nthread-1][nid]; c.SetSubstract(snode[nid].stats, tmp); - if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + if (c.sum_hess >= param.min_child_weight && + tmp.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(tmp.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true); } } @@ -335,25 +353,31 @@ class ColMaker: public IUpdater { e.first_fvalue = fvalue; } else { // forward default right - if (std::abs(fvalue - e.first_fvalue) > rt_2eps){ - if (need_forward) { + if (std::abs(fvalue - e.first_fvalue) > rt_2eps) { + if (need_forward) { c.SetSubstract(snode[nid].stats, e.stats); - if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + if (c.sum_hess >= param.min_child_weight && + e.stats.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + + c.CalcGain(param) - + snode[nid].root_gain); e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false); } } if 
(need_backward) { cright.SetSubstract(e.stats_extra, e.stats); c.SetSubstract(snode[nid].stats, cright); - if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + if (c.sum_hess >= param.min_child_weight && + cright.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(cright.CalcGain(param) + + c.CalcGain(param) - + snode[nid].root_gain); e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true); } } - } + } e.stats.Add(gpair, info, ridx); - e.first_fvalue = fvalue; + e.first_fvalue = fvalue; } } } @@ -361,7 +385,7 @@ class ColMaker: public IUpdater { // update enumeration solution inline void UpdateEnumeration(int nid, bst_gpair gstats, float fvalue, int d_step, bst_uint fid, - TStats &c, std::vector &temp) { + TStats &c, std::vector &temp) { // NOLINT(*) // get the statistics of nid ThreadEntry &e = temp[nid]; // test if first hit, this is fine, because we set 0 during init @@ -370,10 +394,12 @@ class ColMaker: public IUpdater { e.last_fvalue = fvalue; } else { // try to find a split - if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { + if (std::abs(fvalue - e.last_fvalue) > rt_2eps && + e.stats.sum_hess >= param.min_child_weight) { c.SetSubstract(snode[nid].stats, e.stats); if (c.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1); } } @@ -388,7 +414,7 @@ class ColMaker: public IUpdater { int d_step, bst_uint fid, const std::vector &gpair, - std::vector &temp) { + std::vector &temp) { // NOLINT(*) const std::vector &qexpand = qexpand_; // clear all the temp statistics for (size_t j = 0; j < qexpand.size(); ++j) { @@ -423,7 +449,7 @@ class ColMaker: public IUpdater { this->UpdateEnumeration(nid, buf_gpair[i], p->fvalue, d_step, fid, c, temp); - } + } } // finish up the ending piece for (it = align_end, i = 0; it != end; ++i, it += d_step) { @@ -436,14 +462,15 @@ class ColMaker: public IUpdater { this->UpdateEnumeration(nid, buf_gpair[i], it->fvalue, d_step, fid, c, temp); - } + } // finish updating all statistics, check if it is possible to include all sum statistics for (size_t i = 0; i < qexpand.size(); ++i) { const int nid = qexpand[i]; ThreadEntry &e = temp[nid]; c.SetSubstract(snode[nid].stats, e.stats); if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); const float gap = std::abs(e.last_fvalue) + rt_eps; const float delta = d_step == +1 ? 
gap: -gap; e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1); @@ -458,7 +485,7 @@ class ColMaker: public IUpdater { bst_uint fid, const std::vector &gpair, const BoosterInfo &info, - std::vector &temp) { + std::vector &temp) { // NOLINT(*) // use cacheline aware optimization if (TStats::kSimpleStats != 0 && param.cache_opt != 0) { EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp); @@ -471,7 +498,7 @@ class ColMaker: public IUpdater { } // left statistics TStats c(param); - for(const ColBatch::Entry *it = begin; it != end; it += d_step) { + for (const ColBatch::Entry *it = begin; it != end; it += d_step) { const bst_uint ridx = it->index; const int nid = position[ridx]; if (nid < 0) continue; @@ -485,10 +512,12 @@ class ColMaker: public IUpdater { e.last_fvalue = fvalue; } else { // try to find a split - if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { + if (std::abs(fvalue - e.last_fvalue) > rt_2eps && + e.stats.sum_hess >= param.min_child_weight) { c.SetSubstract(snode[nid].stats, e.stats); if (c.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1); } } @@ -503,7 +532,8 @@ class ColMaker: public IUpdater { ThreadEntry &e = temp[nid]; c.SetSubstract(snode[nid].stats, e.stats); if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { - bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + + c.CalcGain(param) - snode[nid].root_gain); const float gap = std::abs(e.last_fvalue) + rt_eps; const float delta = d_step == +1 ? 
gap: -gap; e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1); @@ -511,14 +541,14 @@ class ColMaker: public IUpdater { } } - // update the solution candidate + // update the solution candidate virtual void UpdateSolution(const ColBatch &batch, const std::vector &gpair, const IFMatrix &fmat, const BoosterInfo &info) { // start enumeration const bst_omp_uint nsize = static_cast(batch.size); - #if defined(_OPENMP) + #if defined(_OPENMP) const int batch_size = std::max(static_cast(nsize / this->nthread / 32), 1); #endif int poption = param.parallel_option; @@ -533,11 +563,11 @@ class ColMaker: public IUpdater { const ColBatch::Inst c = batch[i]; const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue; if (param.need_forward_search(fmat.GetColDensity(fid), ind)) { - this->EnumerateSplit(c.data, c.data + c.length, +1, + this->EnumerateSplit(c.data, c.data + c.length, +1, fid, gpair, info, stemp[tid]); } if (param.need_backward_search(fmat.GetColDensity(fid), ind)) { - this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, + this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, fid, gpair, info, stemp[tid]); } } @@ -546,7 +576,7 @@ class ColMaker: public IUpdater { this->ParallelFindSplit(batch[i], batch.col_index[i], fmat, gpair, info); } - } + } } // find splits at current level, do split per level inline void FindSplit(int depth, @@ -571,7 +601,7 @@ class ColMaker: public IUpdater { // get the best result, we can synchronize the solution for (size_t i = 0; i < qexpand.size(); ++i) { const int nid = qexpand[i]; - NodeEntry &e = snode[nid]; + NodeEntry &e = snode[nid]; // now we know the solution in snode[nid], set split if (e.best.loss_chg > rt_eps) { p_tree->AddChilds(nid); @@ -582,19 +612,20 @@ class ColMaker: public IUpdater { } else { (*p_tree)[nid].set_leaf(e.weight * param.learning_rate); } - } + } } // reset position of each data points after split is created in the tree - inline void ResetPosition(const std::vector &qexpand, IFMatrix *p_fmat, const RegTree &tree) { + inline void ResetPosition(const std::vector &qexpand, + IFMatrix *p_fmat, const RegTree &tree) { // set the positions in the nondefault - this->SetNonDefaultPosition(qexpand, p_fmat, tree); + this->SetNonDefaultPosition(qexpand, p_fmat, tree); // set rest of instances to default position const std::vector &rowset = p_fmat->buffered_rowset(); // set default direct nodes to default - // for leaf nodes that are not fresh, mark then to ~nid, + // for leaf nodes that are not fresh, mark then to ~nid, // so that they are ignored in future statistics collection const bst_omp_uint ndata = static_cast(rowset.size()); - + #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < ndata; ++i) { const bst_uint ridx = rowset[i]; @@ -655,7 +686,7 @@ class ColMaker: public IUpdater { const float fvalue = col[j].fvalue; // go back to parent, correct those who are not default if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { - if(fvalue < tree[nid].split_cond()) { + if (fvalue < tree[nid].split_cond()) { this->SetEncodePosition(ridx, tree[nid].cleft()); } else { this->SetEncodePosition(ridx, tree[nid].cright()); @@ -667,7 +698,7 @@ class ColMaker: public IUpdater { } // utils to get/set position, with encoded format // return decoded position - inline int DecodePosition(bst_uint ridx) const{ + inline int DecodePosition(bst_uint ridx) const { const int pid = position[ridx]; return pid < 0 ? 
~pid : pid; } @@ -679,7 +710,7 @@ class ColMaker: public IUpdater { position[ridx] = nid; } } - //--data fields-- + // --data fields-- const TrainParam &param; // number of omp thread used during training int nthread; diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp index c989f4e47..e3d3f8b59 100644 --- a/src/tree/updater_distcol-inl.hpp +++ b/src/tree/updater_distcol-inl.hpp @@ -1,11 +1,15 @@ -#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_ -#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file updater_distcol-inl.hpp - * \brief beta distributed version that takes a sub-column + * \brief beta distributed version that takes a sub-column * and construct a tree * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_ +#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_ + +#include +#include #include "../sync/sync.h" #include "../utils/bitmap.h" #include "../utils/io.h" @@ -27,7 +31,7 @@ class DistColMaker : public ColMaker { virtual void Update(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, - const std::vector &trees) { + const std::vector &trees) { TStats::CheckInfo(info); utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time"); // build the tree @@ -39,11 +43,12 @@ class DistColMaker : public ColMaker { } virtual const int* GetLeafPosition(void) const { return builder.GetLeafPosition(); - } + } + private: struct Builder : public ColMaker::Builder { public: - Builder(const TrainParam &param) + explicit Builder(const TrainParam &param) : ColMaker::Builder(param) { } inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) { @@ -63,7 +68,8 @@ class DistColMaker : public ColMaker { virtual const int* GetLeafPosition(void) const { return BeginPtr(this->position); } - protected: + + protected: virtual void SetNonDefaultPosition(const std::vector &qexpand, IFMatrix *p_fmat, const RegTree &tree) { // step 2, classify the non-default data into right places @@ -87,7 +93,7 @@ class DistColMaker : public ColMaker { #pragma omp parallel for schedule(static) for (bst_omp_uint j = 0; j < ndata; ++j) { boolmap[j] = 0; - } + } } utils::IIterator *iter = p_fmat->ColIterator(fsplits); while (iter->Next()) { @@ -111,7 +117,7 @@ class DistColMaker : public ColMaker { } } } - + bitmap.InitFromBool(boolmap); // communicate bitmap rabit::Allreduce(BeginPtr(bitmap.data), bitmap.data.size()); @@ -142,7 +148,7 @@ class DistColMaker : public ColMaker { } vec.push_back(this->snode[nid].best); } - // TODO, lazy version + // TODO(tqchen) lazy version // communicate best solution reducer.Allreduce(BeginPtr(vec), vec.size()); // assign solution back @@ -151,7 +157,7 @@ class DistColMaker : public ColMaker { this->snode[nid].best = vec[i]; } } - + private: utils::BitMap bitmap; std::vector boolmap; @@ -162,8 +168,8 @@ class DistColMaker : public ColMaker { // training parameter TrainParam param; // pointer to the builder - Builder builder; + Builder builder; }; } // namespace tree } // namespace xgboost -#endif +#endif // XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_ diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp index f739f23f3..d86204e4b 100644 --- a/src/tree/updater_histmaker-inl.hpp +++ b/src/tree/updater_histmaker-inl.hpp @@ -1,10 +1,12 @@ -#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_ -#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_ /*!
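Why DistColMaker Allreduces a bitmap in SetNonDefaultPosition above: each worker holds only a column slice, so it can decide left/right only for rows that are non-default on its own features; a bitwise-OR reduction then merges the partial decisions into a global assignment. Simulated below with two in-process "workers"; the real code performs this with rabit's bitwise-OR Allreduce over bitmap.data:

#include <cstdio>
#include <vector>

int main() {
  // one bit per buffered row, packed 32 per word as in utils::BitMap
  std::vector<unsigned> worker0 = {0x00000005u, 0x00000000u};  // rows 0, 2 go right
  std::vector<unsigned> worker1 = {0x00000008u, 0x00000001u};  // rows 3, 32 go right
  std::vector<unsigned> merged(2);
  for (size_t i = 0; i < merged.size(); ++i)
    merged[i] = worker0[i] | worker1[i];          // the BitOR reduction step
  std::printf("%08x %08x\n", merged[0], merged[1]);  // 0000000d 00000001
  return 0;
}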
+ * Copyright 2014 by Contributors * \file updater_histmaker-inl.hpp * \brief use histogram counting to construct a tree * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_ +#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_ + #include #include #include "../sync/sync.h" @@ -38,7 +40,7 @@ class HistMaker: public BaseMaker { struct HistUnit { /*! \brief cutting point of histogram, contains maximum point */ const bst_float *cut; - /*! \brief content of statistics data */ + /*! \brief content of statistics data */ TStats *data; /*! \brief size of histogram */ unsigned size; @@ -48,13 +50,13 @@ class HistMaker: public BaseMaker { HistUnit(const bst_float *cut, TStats *data, unsigned size) : cut(cut), data(data), size(size) {} /*! \brief add a histogram to data */ - inline void Add(bst_float fv, + inline void Add(bst_float fv, const std::vector &gpair, const BoosterInfo &info, const bst_uint ridx) { unsigned i = std::upper_bound(cut, cut + size, fv) - cut; utils::Assert(size != 0, "try insert into size=0"); - utils::Assert(i < size, + utils::Assert(i < size, "maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]); data[i].Add(gpair, info, ridx); } @@ -74,7 +76,7 @@ class HistMaker: public BaseMaker { rptr[fid+1] - rptr[fid]); } }; - // thread workspace + // thread workspace struct ThreadWSpace { /*! \brief actual unit pointer */ std::vector rptr; @@ -92,7 +94,7 @@ class HistMaker: public BaseMaker { } hset[tid].rptr = BeginPtr(rptr); hset[tid].cut = BeginPtr(cut); - hset[tid].data.resize(cut.size(), TStats(param)); + hset[tid].data.resize(cut.size(), TStats(param)); } } // aggregate all statistics to hset[0] @@ -147,7 +149,7 @@ class HistMaker: public BaseMaker { } // this function does two jobs // (1) reset the position in array position, to be the latest leaf id - // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly + // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly virtual void ResetPosAndPropose(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, @@ -171,8 +173,9 @@ class HistMaker: public BaseMaker { const BoosterInfo &info, const std::vector &fset, const RegTree &tree) = 0; + private: - inline void EnumerateSplit(const HistUnit &hist, + inline void EnumerateSplit(const HistUnit &hist, const TStats &node_sum, bst_uint fid, SplitEntry *best, @@ -187,7 +190,7 @@ class HistMaker: public BaseMaker { c.SetSubstract(node_sum, s); if (c.sum_hess >= param.min_child_weight) { double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; - if (best->Update((float)loss_chg, fid, hist.cut[i], false)) { + if (best->Update(static_cast(loss_chg), fid, hist.cut[i], false)) { *left_sum = s; } } @@ -200,7 +203,7 @@ class HistMaker: public BaseMaker { c.SetSubstract(node_sum, s); if (c.sum_hess >= param.min_child_weight) { double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; - if (best->Update((float)loss_chg, fid, hist.cut[i-1], true)) { + if (best->Update(static_cast(loss_chg), fid, hist.cut[i-1], true)) { *left_sum = c; } } @@ -216,22 +219,22 @@ class HistMaker: public BaseMaker { const size_t num_feature = fset.size(); // get the best split condition for each node std::vector sol(qexpand.size()); - std::vector left_sum(qexpand.size()); + std::vector left_sum(qexpand.size()); bst_omp_uint nexpand = static_cast(qexpand.size()); #pragma omp parallel for schedule(dynamic, 1) - for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) { + for (bst_omp_uint wid = 0; wid < nexpand; ++wid) { 
const int nid = qexpand[wid]; utils::Assert(node2workindex[nid] == static_cast(wid), "node2workindex inconsistent"); SplitEntry &best = sol[wid]; TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0]; - for (size_t i = 0; i < fset.size(); ++ i) { + for (size_t i = 0; i < fset.size(); ++i) { EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)], node_sum, fset[i], &best, &left_sum[wid]); } } // get the best result, we can synchronize the solution - for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) { + for (bst_omp_uint wid = 0; wid < nexpand; ++wid) { const int nid = qexpand[wid]; const SplitEntry &best = sol[wid]; const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0]; @@ -244,7 +247,7 @@ class HistMaker: public BaseMaker { (*p_tree)[nid].set_split(best.split_index(), best.split_value, best.default_left()); // mark right child as 0, to indicate fresh leaf - (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0); + (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0); (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0); // right side sum TStats right_sum; @@ -256,11 +259,11 @@ class HistMaker: public BaseMaker { } } } - + inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) { p_tree->stat(nid).base_weight = static_cast(node_sum.CalcWeight(param)); p_tree->stat(nid).sum_hess = static_cast(node_sum.sum_hess); - node_sum.SetLeafVec(param, p_tree->leafvec(nid)); + node_sum.SetLeafVec(param, p_tree->leafvec(nid)); } }; @@ -270,7 +273,7 @@ class CQHistMaker: public HistMaker { struct HistEntry { typename HistMaker::HistUnit hist; unsigned istart; - /*! + /*! * \brief add a histogram to data, * do linear scan, start from istart */ @@ -282,7 +285,7 @@ class CQHistMaker: public HistMaker { utils::Assert(istart != hist.size, "the bound variable must be max"); hist.data[istart].Add(gpair, info, ridx); } - /*! + /*! 
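Putting the pieces above together: HistUnit::Add bins a feature value with std::upper_bound against the cut points, and EnumerateSplit scans the accumulated bins evaluating loss_chg = gain(left) + gain(right) - gain(parent). An end-to-end miniature with made-up numbers and the simple G*G / (H + lambda) gain; the real CalcGain also handles reg_alpha, and min_child_weight filtering is omitted here:

#include <algorithm>
#include <cstdio>

double Gain(double G, double H, double lambda) { return G * G / (H + lambda); }

int main() {
  const float cut[] = {0.5f, 1.5f, 3.0f};       // last cut covers the feature max
  double G[3] = {0, 0, 0}, H[3] = {0, 0, 0};
  const float fv[] = {0.2f, 0.7f, 1.2f, 2.5f};  // feature values
  const double g[] = {1.0, -2.0, -1.5, 3.0};    // gradients, hessian 1 each
  for (int i = 0; i < 4; ++i) {
    // HistUnit::Add in one line: the first cut greater than fv is the bin
    unsigned b = std::upper_bound(cut, cut + 3, fv[i]) - cut;
    G[b] += g[i]; H[b] += 1.0;
  }
  double Gs = G[0] + G[1] + G[2], Hs = H[0] + H[1] + H[2], lambda = 1.0;
  double root = Gain(Gs, Hs, lambda), gl = 0, hl = 0;
  for (int b = 0; b < 2; ++b) {                 // candidate split after bin b
    gl += G[b]; hl += H[b];
    double chg = Gain(gl, hl, lambda) + Gain(Gs - gl, Hs - hl, lambda) - root;
    std::printf("split at cut[%d]=%g: loss_chg=%g\n", b, cut[b], chg);
  }
  return 0;
}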
* \brief add a histogram to data, * do linear scan, start from istart */ @@ -302,7 +305,7 @@ class CQHistMaker: public HistMaker { feat_helper.InitByCol(p_fmat, tree); feat_helper.SampleCol(this->param.colsample_bytree, p_fset); } - // code to create histogram + // code to create histogram virtual void CreateHist(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, @@ -313,7 +316,7 @@ class CQHistMaker: public HistMaker { std::fill(feat2workindex.begin(), feat2workindex.end(), -1); for (size_t i = 0; i < fset.size(); ++i) { feat2workindex[fset[i]] = static_cast(i); - } + } // start to work this->wspace.Init(this->param, 1); // if it is C++11, use lazy evaluation for Allreduce, @@ -350,11 +353,11 @@ class CQHistMaker: public HistMaker { // sync the histogram // if it is C++11, use lazy evaluation for Allreduce #if __cplusplus >= 201103L - this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), + this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size(), lazy_get_hist); #else - this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size()); -#endif + this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size()); +#endif } virtual void ResetPositionAfterSplit(IFMatrix *p_fmat, const RegTree &tree) { @@ -374,11 +377,11 @@ class CQHistMaker: public HistMaker { feat2workindex[fset[i]] = static_cast(freal_set.size()); freal_set.push_back(fset[i]); } else { - feat2workindex[fset[i]] = -2; + feat2workindex[fset[i]] = -2; } } this->GetNodeStats(gpair, *p_fmat, tree, info, - &thread_stats, &node_stats); + &thread_stats, &node_stats); sketchs.resize(this->qexpand.size() * freal_set.size()); for (size_t i = 0; i < sketchs.size(); ++i) { sketchs[i].Init(info.num_row, this->param.sketch_eps); @@ -394,7 +397,8 @@ class CQHistMaker: public HistMaker { #if __cplusplus >= 201103L auto lazy_get_summary = [&]() #endif - {// get smmary + { + // get smmary thread_sketch.resize(this->get_nthread()); // number of rows in const size_t nrows = p_fmat->buffered_rowset().size(); @@ -457,9 +461,9 @@ class CQHistMaker: public HistMaker { this->wspace.rptr.push_back(static_cast(this->wspace.cut.size())); } else { utils::Assert(offset == -2, "BUG in mark"); - bst_float cpt = feat_helper.MaxValue(fset[i]); + bst_float cpt = feat_helper.MaxValue(fset[i]); this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps); - this->wspace.rptr.push_back(static_cast(this->wspace.cut.size())); + this->wspace.rptr.push_back(static_cast(this->wspace.cut.size())); } } // reserve last value for global statistics @@ -470,7 +474,7 @@ class CQHistMaker: public HistMaker { (fset.size() + 1) * this->qexpand.size() + 1, "cut space inconsistent"); } - + private: inline void UpdateHistCol(const std::vector &gpair, const ColBatch::Inst &c, @@ -554,9 +558,9 @@ class CQHistMaker: public HistMaker { } } else { for (size_t i = 0; i < this->qexpand.size(); ++i) { - const unsigned nid = this->qexpand[i]; + const unsigned nid = this->qexpand[i]; sbuilder[nid].sum_total = static_cast(nstats[nid].sum_hess); - } + } } // if only one value, no need to do second pass if (c[0].fvalue == c[c.length-1].fvalue) { @@ -589,7 +593,7 @@ class CQHistMaker: public HistMaker { if (nid >= 0) { sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size); } - } + } } for (bst_uint j = align_length; j < c.length; ++j) { const bst_uint ridx = c[j].index; @@ -617,7 +621,7 @@ class CQHistMaker: public HistMaker { // temp space to map feature id to working index 
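The C++11 branch in CreateHist above hands a lambda to Allreduce so the expensive local histogram pass runs only when the result is not already available, for example when rabit restores a cached result during failure recovery. A sketch of the pattern; fake_allreduce is a stand-in for the rabit reducer, not its actual signature:

#include <cstdio>
#include <functional>
#include <vector>

// Run the prepare closure only when the local buffer really must be built,
// then (in the real system) combine buffers across workers.
void fake_allreduce(std::vector<double> *buf, bool cached,
                    std::function<void()> prepare) {
  if (!cached) prepare();
  // ... global sum across workers would happen here ...
}

int main() {
  std::vector<double> hist(4, 0.0);
  auto lazy_get_hist = [&]() {
    std::printf("building local histogram\n");
    for (size_t i = 0; i < hist.size(); ++i) hist[i] = 1.0;
  };
  fake_allreduce(&hist, /*cached=*/false, lazy_get_hist);  // pays for the pass
  fake_allreduce(&hist, /*cached=*/true, lazy_get_hist);   // prepare skipped
  return 0;
}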
std::vector feat2workindex; // set of index from fset that are real - std::vector freal_set; + std::vector freal_set; // thread temp data std::vector< std::vector > thread_sketch; // used to hold statistics @@ -631,18 +635,18 @@ class CQHistMaker: public HistMaker { // reducer for summary rabit::SerializeReducer sreducer; // per node, per feature sketch - std::vector< utils::WXQuantileSketch > sketchs; + std::vector< utils::WXQuantileSketch > sketchs; }; template -class QuantileHistMaker: public HistMaker { +class QuantileHistMaker: public HistMaker { protected: typedef utils::WXQuantileSketch WXQSketch; virtual void ResetPosAndPropose(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, const std::vector &fset, - const RegTree &tree) { + const RegTree &tree) { // initialize the data structure int nthread = BaseMaker::get_nthread(); sketchs.resize(this->qexpand.size() * tree.param.num_feature); @@ -658,7 +662,7 @@ class QuantileHistMaker: public HistMaker { utils::ParallelGroupBuilder builder(&col_ptr, &col_data, &thread_col_ptr); builder.InitBudget(tree.param.num_feature, nthread); - const bst_omp_uint nbatch = static_cast(batch.size); + const bst_omp_uint nbatch = static_cast(batch.size); #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < nbatch; ++i) { RowBatch::Inst inst = batch[i]; @@ -667,11 +671,11 @@ class QuantileHistMaker: public HistMaker { if (nid >= 0) { if (!tree[nid].is_leaf()) { this->position[ridx] = nid = HistMaker::NextLevel(inst, tree, nid); - } + } if (this->node2workindex[nid] < 0) { this->position[ridx] = ~nid; - } else{ - for (bst_uint j = 0; j < inst.length; ++j) { + } else { + for (bst_uint j = 0; j < inst.length; ++j) { builder.AddBudget(inst[j].index, omp_get_thread_num()); } } @@ -712,8 +716,8 @@ class QuantileHistMaker: public HistMaker { summary_array[i].Reserve(max_size); summary_array[i].SetPrune(out, max_size); } - - size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size); + + size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size); sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size()); // now we get the final result of sketch, setup the cut this->wspace.cut.clear(); diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp index e7e5f9f0b..dc99e94e4 100644 --- a/src/tree/updater_prune-inl.hpp +++ b/src/tree/updater_prune-inl.hpp @@ -1,10 +1,12 @@ -#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_ -#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file updater_prune-inl.hpp - * \brief prune a tree given the statistics + * \brief prune a tree given the statistics * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_ +#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_ + #include #include "./param.h" #include "./updater.h" @@ -37,9 +39,10 @@ class TreePruner: public IUpdater { param.learning_rate = lr; syncher.Update(gpair, p_fmat, info, trees); } + private: // try to prune off current leaf - inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { + inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*) if (tree[nid].is_root()) return npruned; int pid = tree[nid].parent(); RegTree::NodeStat &s = tree.stat(pid); @@ -51,10 +54,10 @@ class TreePruner: public IUpdater { return this->TryPruneLeaf(tree, pid, depth - 1, npruned+2); } else { return npruned; - } + } } /*! 
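The builder calls above (InitBudget, AddBudget, InitStorage, Push) follow the two-pass CSR construction documented in group_data.h later in this patch: pass one counts entries per key, a prefix sum turns counts into row pointers, and pass two writes the values. A single-threaded walk-through; the real class keeps one budget vector per thread and merges them in InitStorage:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  const int nkeys = 3;
  std::pair<int, char> input[] = {{2, 'a'}, {0, 'b'}, {2, 'c'}, {1, 'd'}};
  std::vector<size_t> rptr(nkeys + 1, 0);
  for (auto &kv : input) ++rptr[kv.first + 1];             // AddBudget
  for (int k = 0; k < nkeys; ++k) rptr[k + 1] += rptr[k];  // InitStorage prefix sum
  std::vector<char> data(rptr[nkeys]);
  std::vector<size_t> head(rptr.begin(), rptr.end() - 1);  // per-key write cursor
  for (auto &kv : input) data[head[kv.first]++] = kv.second;  // Push
  for (int k = 0; k < nkeys; ++k) {
    std::printf("key %d:", k);
    for (size_t j = rptr[k]; j < rptr[k + 1]; ++j) std::printf(" %c", data[j]);
    std::printf("\n");
  }
  return 0;
}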
\brief do prunning of a tree */ - inline void DoPrune(RegTree &tree) { + inline void DoPrune(RegTree &tree) { // NOLINT(*) int npruned = 0; // initialize auxiliary statistics for (int nid = 0; nid < tree.param.num_nodes; ++nid) { diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp index 8613c8ea6..b6c5ee89e 100644 --- a/src/tree/updater_refresh-inl.hpp +++ b/src/tree/updater_refresh-inl.hpp @@ -1,10 +1,12 @@ -#ifndef XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_ -#define XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file updater_refresh-inl.hpp * \brief refresh the statistics and leaf value on the tree on the dataset * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_ +#define XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_ + #include #include #include "../sync/sync.h" @@ -27,7 +29,7 @@ class TreeRefresher: public IUpdater { virtual void Update(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, - const std::vector &trees) { + const std::vector &trees) { if (trees.size() == 0) return; // number of threads // thread temporal space @@ -100,7 +102,7 @@ class TreeRefresher: public IUpdater { float lr = param.learning_rate; param.learning_rate = lr / trees.size(); int offset = 0; - for (size_t i = 0; i < trees.size(); ++i) { + for (size_t i = 0; i < trees.size(); ++i) { for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) { this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]); } @@ -147,7 +149,7 @@ class TreeRefresher: public IUpdater { // training parameter TrainParam param; // reducer - rabit::Reducer reducer; + rabit::Reducer reducer; }; } // namespace tree diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker-inl.hpp index 6bc2fc39a..ade22011b 100644 --- a/src/tree/updater_skmaker-inl.hpp +++ b/src/tree/updater_skmaker-inl.hpp @@ -1,11 +1,13 @@ -#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_ -#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file updater_skmaker-inl.hpp * \brief use approximation sketch to construct a tree, a refresh is needed to make the statistics exactly correct * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_ +#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_ + #include #include #include "../sync/sync.h" @@ -30,7 +32,7 @@ class SketchMaker: public BaseMaker { } param.learning_rate = lr; } - + protected: inline void Update(const std::vector &gpair, IFMatrix *p_fmat, @@ -79,9 +81,9 @@ class SketchMaker: public BaseMaker { double pos_grad; /*! \brief sum of all negative gradient */ double neg_grad; - /*! \brief sum of hessian statistics */ + /*! \brief sum of hessian statistics */ double sum_hess; - explicit SKStats(void) {} + SKStats(void) {} // constructor explicit SKStats(const TrainParam &param) { this->Clear(); } @@ -123,7 +125,7 @@ class SketchMaker: public BaseMaker { sum_hess += b.sum_hess; } /*! \brief same as add, reduce is used in All Reduce */ - inline static void Reduce(SKStats &a, const SKStats &b) { + inline static void Reduce(SKStats &a, const SKStats &b) { // NOLINT(*) a.Add(b); } /*!
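TryPruneLeaf above recurses upward: once a node's split no longer pays for itself, the node collapses to a leaf (npruned + 2) and the same test is retried on its parent. A deliberately simplified chain where node i's parent is i - 1; need_prune stands in for param.need_prune, and the leaf_child_cnt bookkeeping of the real code is omitted:

#include <cstdio>

// Prune while the recorded split gain stays below the threshold (gamma).
bool need_prune(double loss_chg, double gamma) { return loss_chg < gamma; }

int prune_chain(const double *loss_chg, int nid, double gamma, int npruned) {
  if (nid < 0) return npruned;                  // walked past the root
  if (!need_prune(loss_chg[nid], gamma)) return npruned;
  return prune_chain(loss_chg, nid - 1, gamma, npruned + 2);  // collapse, retry parent
}

int main() {
  // a path root(0) -> 1 -> 2 with the split gain recorded at each node
  double loss_chg[] = {0.9, 0.05, 0.01};
  std::printf("pruned %d nodes\n", prune_chain(loss_chg, 2, 0.1, 0));  // pruned 4 nodes
  return 0;
}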
\brief set leaf vector value based on statistics */ @@ -139,7 +141,7 @@ class SketchMaker: public BaseMaker { sketchs[i].Init(info.num_row, this->param.sketch_eps); } thread_sketch.resize(this->get_nthread()); - // number of rows in + // number of rows in const size_t nrows = p_fmat->buffered_rowset().size(); // start accumulating statistics utils::IIterator *iter = p_fmat->ColIterator(); @@ -156,7 +158,7 @@ class SketchMaker: public BaseMaker { batch[i].length == nrows, &thread_sketch[omp_get_thread_num()]); } - } + } // setup maximum size unsigned max_size = param.max_sketch_size(); // synchronize sketch @@ -167,8 +169,8 @@ class SketchMaker: public BaseMaker { summary_array[i].Reserve(max_size); summary_array[i].SetPrune(out, max_size); } - size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size); - sketch_reducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size()); + size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size); + sketch_reducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size()); } // update sketch information in column fid inline void UpdateSketchCol(const std::vector &gpair, @@ -209,7 +211,7 @@ class SketchMaker: public BaseMaker { const unsigned nid = this->qexpand[i]; sbuilder[3 * nid + 0].sum_total = static_cast(nstats[nid].pos_grad); sbuilder[3 * nid + 1].sum_total = static_cast(nstats[nid].neg_grad); - sbuilder[3 * nid + 2].sum_total = static_cast(nstats[nid].sum_hess); + sbuilder[3 * nid + 2].sum_total = static_cast(nstats[nid].sum_hess); } } // if only one value, no need to do second pass @@ -217,7 +219,9 @@ class SketchMaker: public BaseMaker { for (size_t i = 0; i < this->qexpand.size(); ++i) { const int nid = this->qexpand[i]; for (int k = 0; k < 3; ++k) { - sbuilder[3 * nid + k].sketch->Push(c[0].fvalue, static_cast(sbuilder[3 * nid + k].sum_total)); + sbuilder[3 * nid + k].sketch->Push(c[0].fvalue, + static_cast( + sbuilder[3 * nid + k].sum_total)); } } return; @@ -250,7 +254,7 @@ class SketchMaker: public BaseMaker { sbuilder[3 * nid + k].Finalize(max_size); } } - } + } inline void SyncNodeStats(void) { utils::Assert(qexpand.size() != 0, "qexpand must not be empty"); std::vector tmp(qexpand.size()); @@ -272,12 +276,12 @@ class SketchMaker: public BaseMaker { std::vector sol(qexpand.size()); bst_omp_uint nexpand = static_cast(qexpand.size()); #pragma omp parallel for schedule(dynamic, 1) - for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) { + for (bst_omp_uint wid = 0; wid < nexpand; ++wid) { const int nid = qexpand[wid]; utils::Assert(node2workindex[nid] == static_cast(wid), "node2workindex inconsistent"); SplitEntry &best = sol[wid]; - for (bst_uint fid = 0; fid < num_feature; ++ fid) { + for (bst_uint fid = 0; fid < num_feature; ++fid) { unsigned base = (wid * p_tree->param.num_feature + fid) * 3; EnumerateSplit(summary_array[base + 0], summary_array[base + 1], @@ -286,7 +290,7 @@ class SketchMaker: public BaseMaker { } } // get the best result, we can synchronize the solution - for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) { + for (bst_omp_uint wid = 0; wid < nexpand; ++wid) { const int nid = qexpand[wid]; const SplitEntry &best = sol[wid]; // set up the values @@ -337,7 +341,7 @@ class SketchMaker: public BaseMaker { feat_sum.neg_grad = neg_grad.data[neg_grad.size - 1].rmax; feat_sum.sum_hess = sum_hess.data[sum_hess.size - 1].rmax; size_t ipos = 0, ineg = 0, ihess = 0; - for (size_t i = 1; i < fsplits.size(); ++i) { + for (size_t i = 1; i < fsplits.size(); ++i) { WXQSketch::Entry pos = 
pos_grad.Query(fsplits[i], ipos); WXQSketch::Entry neg = neg_grad.Query(fsplits[i], ineg); WXQSketch::Entry hess = sum_hess.Query(fsplits[i], ihess); @@ -345,11 +349,11 @@ class SketchMaker: public BaseMaker { s.pos_grad = 0.5f * (pos.rmin + pos.rmax - pos.wmin); s.neg_grad = 0.5f * (neg.rmin + neg.rmax - neg.wmin); s.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin); - c.SetSubstract(node_sum, s); + c.SetSubstract(node_sum, s); // forward if (s.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { - double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; + double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; best->Update(static_cast(loss_chg), fid, fsplits[i], false); } // backward @@ -357,22 +361,23 @@ class SketchMaker: public BaseMaker { s.SetSubstract(node_sum, c); if (s.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { - double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; + double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; best->Update(static_cast(loss_chg), fid, fsplits[i], true); - } + } } - {// all including + { + // all including SKStats s = feat_sum, c; c.SetSubstract(node_sum, s); if (s.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { bst_float cpt = fsplits.back(); - double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; + double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain; best->Update(static_cast(loss_chg), fid, cpt + fabsf(cpt) + 1.0f, false); } } } - + // thread temp data // used to hold temporal sketch std::vector< std::vector > thread_sketch; @@ -389,6 +394,6 @@ class SketchMaker: public BaseMaker { // per node, per feature sketch std::vector< utils::WXQuantileSketch > sketchs; }; -} // tree -} // xgboost -#endif +} // namespace tree +} // namespace xgboost +#endif // XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_ diff --git a/src/tree/updater_sync-inl.hpp b/src/tree/updater_sync-inl.hpp index 2aa534aa8..e76d1f76d 100644 --- a/src/tree/updater_sync-inl.hpp +++ b/src/tree/updater_sync-inl.hpp @@ -1,18 +1,21 @@ -#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_ -#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_ /*! + * Copyright 2014 by Contributors * \file updater_sync-inl.hpp * \brief synchronize the tree in all distributed nodes * \author Tianqi Chen */ +#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_ +#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_ + #include +#include #include #include "../sync/sync.h" #include "./updater.h" namespace xgboost { namespace tree { -/*! +/*! * \brief syncher that synchronize the tree in all distributed nodes * can implement various strategies, so far it is always set to node 0's tree */ @@ -28,7 +31,7 @@ class TreeSyncher: public IUpdater { const std::vector &trees) { this->SyncTrees(trees); } - + private: // synchronize the trees in different nodes, take tree from rank 0 inline void SyncTrees(const std::vector &trees) { @@ -43,7 +46,7 @@ class TreeSyncher: public IUpdater { } fs.Seek(0); rabit::Broadcast(&s_model, 0); - for (size_t i = 0; i < trees.size(); ++i) { + for (size_t i = 0; i < trees.size(); ++i) { trees[i]->LoadModel(fs); } } diff --git a/src/utils/base64-inl.h b/src/utils/base64-inl.h index 9fd5fc49f..49cd65254 100644 --- a/src/utils/base64-inl.h +++ b/src/utils/base64-inl.h @@ -1,13 +1,16 @@ -#ifndef XGBOOST_UTILS_BASE64_INL_H_ -#define XGBOOST_UTILS_BASE64_INL_H_ /*! 
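The 0.5f * (rmin + rmax - wmin) expressions above pick the midpoint of the feasible range for the mass strictly below the queried split, given the rank bounds a sketch entry provides; that midpoint is SketchMaker's estimate of the left-child statistic. In isolation, with illustrative numbers:

#include <cstdio>

int main() {
  // rank bounds returned by a quantile-sketch query at a split candidate
  double rmin = 3.0, rmax = 6.0, wmin = 1.0;
  // mass strictly below the split lies in [rmin, rmax - wmin]; take the middle
  double below = 0.5 * (rmin + rmax - wmin);
  std::printf("estimated mass left of split: %g\n", below);  // 4
  return 0;
}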
+ * Copyright 2014 by Contributors * \file base64.h * \brief data stream support to input and output from/to base64 stream * base64 is easier to store and pass as text format in mapreduce * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_BASE64_INL_H_ +#define XGBOOST_UTILS_BASE64_INL_H_ + #include #include +#include #include "./io.h" namespace xgboost { @@ -15,7 +18,7 @@ namespace utils { /*! \brief buffer reader of the stream that allows you to get */ class StreamBufferReader { public: - StreamBufferReader(size_t buffer_size) + explicit StreamBufferReader(size_t buffer_size) :stream_(NULL), read_len_(1), read_ptr_(1) { buffer_.resize(buffer_size); @@ -45,7 +48,7 @@ class StreamBufferReader { inline bool AtEnd(void) const { return read_len_ == 0; } - + private: /*! \brief the underlying stream */ IStream *stream_; @@ -75,7 +78,7 @@ const char DecodeTable[] = { }; static const char EncodeTable[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -} // namespace base64 +} // namespace base64 /*! \brief the stream that reads from base64, note we take from file pointers */ class Base64InStream: public IStream { public: @@ -83,8 +86,8 @@ class Base64InStream: public IStream { reader_.set_stream(fs); num_prev = 0; tmp_ch = 0; } - /*! - * \brief initialize the stream position to beginning of next base64 stream + /*! + * \brief initialize the stream position to beginning of next base64 stream * call this function before actually start read */ inline void InitPosition(void) { @@ -132,19 +135,19 @@ class Base64InStream: public IStream { { // second byte utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)), - "invalid base64 format"); + "invalid base64 format"); nvalue |= DecodeTable[tmp_ch] << 12; *cptr++ = (nvalue >> 16) & 0xFF; --tlen; } { // third byte utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)), - "invalid base64 format"); + "invalid base64 format"); // handle termination if (tmp_ch == '=') { utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format"); utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)), - "invalid base64 format"); + "invalid base64 format"); break; } nvalue |= DecodeTable[tmp_ch] << 6; @@ -157,10 +160,10 @@ class Base64InStream: public IStream { { // fourth byte utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)), - "invalid base64 format"); + "invalid base64 format"); if (tmp_ch == '=') { utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)), - "invalid base64 format"); + "invalid base64 format"); break; } nvalue |= DecodeTable[tmp_ch]; @@ -240,13 +243,13 @@ class Base64OutStream: public IStream { if (endch != EOF) PutChar(endch); this->Flush(); } - - private: + + private: IStream *fp; int buf_top; unsigned char buf[4]; std::string out_buf; - const static size_t kBufferSize = 256; + static const size_t kBufferSize = 256; inline void PutChar(char ch) { out_buf += ch; @@ -260,5 +263,5 @@ class Base64OutStream: public IStream { } }; } // namespace utils -} // namespace rabit -#endif // RABIT_LEARN_UTILS_BASE64_INL_H_ +} // namespace xgboost +#endif // XGBOOST_UTILS_BASE64_INL_H_ diff --git a/src/utils/bitmap.h b/src/utils/bitmap.h index ba12caf41..eecccbda5 100644 --- a/src/utils/bitmap.h +++ b/src/utils/bitmap.h @@ -1,11 +1,13 @@ -#ifndef XGBOOST_UTILS_BITMAP_H_ -#define XGBOOST_UTILS_BITMAP_H_ /*! 
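The Base64InStream inner loop above unpacks four base64 characters (24 bits) into three bytes, byte by byte as each character arrives. A toy decoder for one quad that skips the '=' padding, whitespace, and EOF handling the real stream performs; it assumes all four characters are valid table entries:

#include <cstdio>
#include <cstring>

static const char kTable[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Four 6-bit groups become one 24-bit value, split into three bytes.
void decode4(const char quad[4], unsigned char out[3]) {
  unsigned nvalue = 0;
  for (int i = 0; i < 4; ++i)
    nvalue = (nvalue << 6) | (unsigned)(std::strchr(kTable, quad[i]) - kTable);
  out[0] = (nvalue >> 16) & 0xFF;
  out[1] = (nvalue >> 8) & 0xFF;
  out[2] = nvalue & 0xFF;
}

int main() {
  unsigned char out[3];
  decode4("TWFu", out);
  std::printf("%c%c%c\n", out[0], out[1], out[2]);  // prints "Man"
  return 0;
}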
+ * Copyright 2014 by Contributors * \file bitmap.h * \brief a simple implement of bitmap * NOTE: bitmap is only threadsafe per word access, remember this when using bitmap * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_BITMAP_H_ +#define XGBOOST_UTILS_BITMAP_H_ + #include #include "./utils.h" #include "./omp.h" @@ -16,22 +18,22 @@ namespace utils { struct BitMap { /*! \brief internal data structure */ std::vector data; - /*! - * \brief resize the bitmap to be certain size + /*! + * \brief resize the bitmap to be certain size * \param size the size of bitmap */ inline void Resize(size_t size) { data.resize((size + 31U) >> 5, 0); } - /*! - * \brief query the i-th position of bitmap - * \param i the position in + /*! + * \brief query the i-th position of bitmap + * \param i the position in */ inline bool Get(size_t i) const { return (data[i >> 5] >> (i & 31U)) & 1U; } - /*! - * \brief set i-th position to true + /*! + * \brief set i-th position to true * \param i position index */ inline void SetTrue(size_t i) { @@ -63,4 +65,4 @@ struct BitMap { }; } // namespace utils } // namespace xgboost -#endif +#endif // XGBOOST_UTILS_BITMAP_H_ diff --git a/src/utils/config.h b/src/utils/config.h index 19f4980cf..43d7bc8bd 100644 --- a/src/utils/config.h +++ b/src/utils/config.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_UTILS_CONFIG_H_ -#define XGBOOST_UTILS_CONFIG_H_ /*! + * Copyright 2014 by Contributors * \file config.h * \brief helper class to load in configures from file * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_CONFIG_H_ +#define XGBOOST_UTILS_CONFIG_H_ + #include #include #include @@ -14,26 +16,26 @@ namespace xgboost { namespace utils { -/*! +/*! * \brief base implementation of config reader */ class ConfigReaderBase { public: - /*! + /*! * \brief get current name, called after Next returns true - * \return current parameter name + * \return current parameter name */ inline const char *name(void) const { return s_name.c_str(); } - /*! + /*! * \brief get current value, called after Next returns true - * \return current parameter value + * \return current parameter value */ inline const char *val(void) const { return s_val.c_str(); } - /*! + /*! * \brief move iterator to next position * \return true if there is value in next position */ @@ -55,7 +57,7 @@ class ConfigReaderBase { protected: /*! * \brief to be implemented by subclass, - * get next token, return EOF if end of file + * get next token, return EOF if end of file */ virtual char GetChar(void) = 0; /*! \brief to be implemented by child, check if end of stream */ @@ -144,9 +146,9 @@ class ConfigReaderBase { */ class ConfigStreamReader: public ConfigReaderBase { public: - /*! - * \brief constructor - * \param istream input stream + /*! + * \brief constructor + * \param istream input stream */ explicit ConfigStreamReader(std::istream &fin) : fin(fin) {} @@ -163,13 +165,13 @@ class ConfigStreamReader: public ConfigReaderBase { std::istream &fin; }; -/*! +/*! * \brief an iterator that iterates over a configure file and gets the configures */ class ConfigIterator: public ConfigStreamReader { public: - /*! - * \brief constructor + /*! + * \brief constructor * \param fname name of configure file */ explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) { diff --git a/src/utils/fmap.h b/src/utils/fmap.h index 607f37013..218a61aa4 100644 --- a/src/utils/fmap.h +++ b/src/utils/fmap.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_UTILS_FMAP_H_ -#define XGBOOST_UTILS_FMAP_H_ /*! 
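The BitMap above in one breath: word index is i >> 5, bit mask is 1 << (i & 31). As the header warns, only whole-word access is thread safe, which is why DistColMaker fills a plain boolmap inside its parallel loop and converts with InitFromBool afterwards. A minimal sketch of the two operations:

#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> data((100 + 31) / 32, 0);   // Resize(100): 32 bits per word
  size_t i = 70;
  data[i >> 5] |= 1U << (i & 31U);                  // SetTrue(70)
  bool bit = (data[i >> 5] >> (i & 31U)) & 1U;      // Get(70)
  std::printf("%d\n", static_cast<int>(bit));       // 1
  return 0;
}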
+ * Copyright 2014 by Contributors * \file fmap.h * \brief helper class that holds the feature names and interpretations * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_FMAP_H_ +#define XGBOOST_UTILS_FMAP_H_ + #include #include #include @@ -78,4 +80,4 @@ class FeatMap { } // namespace utils } // namespace xgboost -#endif // XGBOOST_FMAP_H_ +#endif // XGBOOST_UTILS_FMAP_H_ diff --git a/src/utils/group_data.h b/src/utils/group_data.h index 6e12a39ff..31f9c3a50 100644 --- a/src/utils/group_data.h +++ b/src/utils/group_data.h @@ -1,6 +1,5 @@ -#ifndef XGBOOST_UTILS_GROUP_DATA_H_ -#define XGBOOST_UTILS_GROUP_DATA_H_ /*! + * Copyright 2014 by Contributors * \file group_data.h * \brief this file defines utils to group data by integer keys * Input: given input sequence (key,value), (k1,v1), (k2,v2) @@ -12,6 +11,11 @@ * The major algorithm is a two pass linear scan algorithm that requires two pass scan over the data * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_GROUP_DATA_H_ +#define XGBOOST_UTILS_GROUP_DATA_H_ + +#include + namespace xgboost { namespace utils { /*! @@ -32,10 +36,10 @@ struct ParallelGroupBuilder { std::vector< std::vector > *p_thread_rptr) : rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) { } - + public: /*! - * \brief step 1: initialize the helper, with hint of number keys + * \brief step 1: initialize the helper, with hint of number keys * and thread used in the construction * \param nkeys number of keys in the matrix, can be smaller than expected * \param nthread number of thread that will be used in construction @@ -56,7 +60,7 @@ struct ParallelGroupBuilder { inline void AddBudget(size_t key, int threadid, SizeType nelem = 1) { std::vector &trptr = thread_rptr[threadid]; if (trptr.size() < key + 1) { - trptr.resize(key + 1, 0); + trptr.resize(key + 1, 0); } trptr[key] += nelem; } @@ -84,13 +88,13 @@ struct ParallelGroupBuilder { data.resize(start); } /*! - * \brief step 4: add data to the allocated space, + * \brief step 4: add data to the allocated space, * the calls to this function should be exactly match previous call to AddBudget * - * \param key the key of + * \param key the key of * \param threadid the id of thread that calls this function */ - inline void Push(size_t key, ValueType value, int threadid) { + inline void Push(size_t key, ValueType value, int threadid) { SizeType &rp = thread_rptr[threadid][key]; data[rp++] = value; } @@ -107,5 +111,4 @@ struct ParallelGroupBuilder { }; } // namespace utils } // namespace xgboost -#endif - +#endif // XGBOOST_UTILS_GROUP_DATA_H_ diff --git a/src/utils/io.h b/src/utils/io.h index d96d16e2a..5b366e51c 100644 --- a/src/utils/io.h +++ b/src/utils/io.h @@ -1,16 +1,19 @@ -#ifndef XGBOOST_UTILS_IO_H -#define XGBOOST_UTILS_IO_H +/*! + * Copyright 2014 by Contributors + * \file io.h + * \brief general stream interface for serialization, I/O + * \author Tianqi Chen + */ + +#ifndef XGBOOST_UTILS_IO_H_ +#define XGBOOST_UTILS_IO_H_ #include #include #include #include #include "./utils.h" #include "../sync/sync.h" -/*! 
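The ConfigReaderBase contract above (Next() advances, then name() and val() expose the current pair) can be mimicked in a few lines. This sketch only splits each line on its first '='; the real reader additionally handles '#' comments, quoted strings, and surrounding whitespace:

#include <cstdio>
#include <sstream>
#include <string>

int main() {
  std::istringstream fin("booster=gbtree\nmax_depth=6\neta=0.3\n");
  std::string line;
  while (std::getline(fin, line)) {            // plays the role of Next()
    size_t eq = line.find('=');
    if (eq == std::string::npos) continue;
    std::string name = line.substr(0, eq);     // name()
    std::string val = line.substr(eq + 1);     // val()
    std::printf("%s -> %s\n", name.c_str(), val.c_str());
  }
  return 0;
}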
- * \file io.h - * \brief general stream interface for serialization, I/O - * \author Tianqi Chen - */ + namespace xgboost { namespace utils { // reuse the definitions of streams @@ -23,7 +26,7 @@ typedef rabit::utils::MemoryBufferStream MemoryBufferStream; class FileStream : public ISeekStream { public: explicit FileStream(std::FILE *fp) : fp(fp) {} - explicit FileStream(void) { + FileStream(void) { this->fp = NULL; } virtual size_t Read(void *ptr, size_t size) { @@ -33,7 +36,7 @@ class FileStream : public ISeekStream { std::fwrite(ptr, size, 1, fp); } virtual void Seek(size_t pos) { - std::fseek(fp, static_cast(pos), SEEK_SET); + std::fseek(fp, static_cast(pos), SEEK_SET); // NOLINT(*) } virtual size_t Tell(void) { return std::ftell(fp); @@ -42,7 +45,7 @@ class FileStream : public ISeekStream { return std::feof(fp) != 0; } inline void Close(void) { - if (fp != NULL){ + if (fp != NULL) { std::fclose(fp); fp = NULL; } } @@ -52,6 +55,5 @@ class FileStream : public ISeekStream { }; } // namespace utils } // namespace xgboost - #include "./base64-inl.h" -#endif +#endif // XGBOOST_UTILS_IO_H_ diff --git a/src/utils/iterator.h b/src/utils/iterator.h index 3f5b23310..5d986b2e4 100644 --- a/src/utils/iterator.h +++ b/src/utils/iterator.h @@ -1,11 +1,13 @@ -#ifndef XGBOOST_UTILS_ITERATOR_H -#define XGBOOST_UTILS_ITERATOR_H -#include /*! + * Copyright 2014 by Contributors * \file iterator.h * \brief itertator interface * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_ITERATOR_H_ +#define XGBOOST_UTILS_ITERATOR_H_ +#include + namespace xgboost { namespace utils { /*! @@ -16,7 +18,7 @@ template class IIterator { public: /*! - * \brief set the parameter + * \brief set the parameter * \param name name of parameter * \param val value of parameter */ @@ -36,5 +38,5 @@ class IIterator { } // namespace utils } // namespace xgboost -#endif +#endif // XGBOOST_UTILS_ITERATOR_H_ diff --git a/src/utils/math.h b/src/utils/math.h index e0bf8c466..7609df076 100644 --- a/src/utils/math.h +++ b/src/utils/math.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_UTILS_MATH_H_ -#define XGBOOST_UTILS_MATH_H_ /*! + * Copyright 2014 by Contributors * \file math.h * \brief support additional math * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_MATH_H_ +#define XGBOOST_UTILS_MATH_H_ + #include namespace xgboost { @@ -28,7 +30,8 @@ inline T LogGamma(T v) { #if _MSC_VER >= 1800 return lgamma(v); #else -#pragma message ("Warning: lgamma function was not available until VS2013, poisson regression will be disabled") +#pragma message("Warning: lgamma function was not available until VS2013"\ + ", poisson regression will be disabled") utils::Error("lgamma function was not available until VS2013"); return static_cast(1.0); #endif diff --git a/src/utils/omp.h b/src/utils/omp.h index 87cad380e..ddd3467d9 100644 --- a/src/utils/omp.h +++ b/src/utils/omp.h @@ -1,16 +1,20 @@ -#ifndef XGBOOST_UTILS_OMP_H_ -#define XGBOOST_UTILS_OMP_H_ /*! + * Copyright 2014 by Contributors * \file omp.h * \brief header to handle OpenMP compatibility issues * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_OMP_H_ +#define XGBOOST_UTILS_OMP_H_ + #if defined(_OPENMP) #include #else #ifndef DISABLE_OPENMP // use pragma message instead of warning -#pragma message ("Warning: OpenMP is not available, xgboost will be compiled into single-thread code. 
Use OpenMP-enabled compiler to get benefit of multi-threading") +#pragma message("Warning: OpenMP is not available,"\ + "xgboost will be compiled into single-thread code."\ + "Use OpenMP-enabled compiler to get benefit of multi-threading") #endif inline int omp_get_thread_num() { return 0; } inline int omp_get_num_threads() { return 1; } @@ -25,6 +29,6 @@ typedef int bst_omp_uint; #else typedef unsigned bst_omp_uint; #endif -} // namespace xgboost +} // namespace xgboost #endif // XGBOOST_UTILS_OMP_H_ diff --git a/src/utils/quantile.h b/src/utils/quantile.h index 4e885e254..ffd9142da 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_UTILS_QUANTILE_H_ -#define XGBOOST_UTILS_QUANTILE_H_ /*! + * Copyright 2014 by Contributors * \file quantile.h - * \brief util to compute quantiles + * \brief util to compute quantiles * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_QUANTILE_H_ +#define XGBOOST_UTILS_QUANTILE_H_ + #include #include #include @@ -37,8 +39,8 @@ struct WQSummary { // constructor Entry(RType rmin, RType rmax, RType wmin, DType value) : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {} - /*! - * \brief debug function, check Valid + /*! + * \brief debug function, check Valid * \param eps the tolerate level for violating the relation */ inline void CheckValid(RType eps = 0) const { @@ -65,7 +67,7 @@ struct WQSummary { // default constructor QEntry(void) {} // constructor - QEntry(DType value, RType weight) + QEntry(DType value, RType weight) : value(value), weight(weight) {} // comparator on value inline bool operator<(const QEntry &b) const { @@ -83,11 +85,11 @@ struct WQSummary { } else { queue[qtail - 1].weight += w; } - } + } inline void MakeSummary(WQSummary *out) { std::sort(queue.begin(), queue.begin() + qtail); out->size = 0; - // start update sketch + // start update sketch RType wsum = 0; // construct data with unique weights for (size_t i = 0; i < qtail;) { @@ -106,7 +108,7 @@ struct WQSummary { /*! \brief number of elements in the summary */ size_t size; // constructor - WQSummary(Entry *data, size_t size) + WQSummary(Entry *data, size_t size) : data(data), size(size) {} /*! * \return the maximum error of the Summary @@ -119,12 +121,12 @@ struct WQSummary { } return res; } - /*! + /*! * \brief query qvalue, start from istart * \param qvalue the value we query for * \param istart starting position */ - inline Entry Query(DType qvalue, size_t &istart) const { + inline Entry Query(DType qvalue, size_t &istart) const { // NOLINT(*) while (istart < size && qvalue > data[istart].value) { ++istart; } @@ -136,7 +138,7 @@ struct WQSummary { return data[istart]; } else { if (istart == 0) { - return Entry(0.0f, 0.0f, 0.0f, qvalue); + return Entry(0.0f, 0.0f, 0.0f, qvalue); } else { return Entry(data[istart - 1].rmin_next(), data[istart].rmax_prev(), @@ -154,12 +156,12 @@ struct WQSummary { */ inline void CopyFrom(const WQSummary &src) { size = src.size; - std::memcpy(data, src.data, sizeof(Entry) * size); - } - /*! - * \brief debug function, validate whether the summary + std::memcpy(data, src.data, sizeof(Entry) * size); + } + /*! 
+ * \brief debug function, validate whether the summary * run consistency check to check if it is a valid summary - * \param eps the tolerate error level, used when RType is floating point and + * \param eps the tolerate error level, used when RType is floating point and * some inconsistency could occur due to rounding error */ inline void CheckValid(RType eps) const { @@ -199,8 +201,8 @@ struct WQSummary { size_t i = 1, lastidx = 0; for (size_t k = 1; k < n; ++k) { RType dx2 = 2 * ((k * range) / n + begin); - // find first i such that d < (rmax[i+1] + rmin[i+1]) / 2 - while (i < src.size - 1 + // find first i such that d < (rmax[i+1] + rmin[i+1]) / 2 + while (i < src.size - 1 && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i; utils::Assert(i != src.size - 1, "this cannot happen"); if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) { @@ -217,7 +219,7 @@ struct WQSummary { data[size++] = src.data[src.size - 1]; } } - /*! + /*! * \brief set current summary to be merged summary of sa and sb * \param sa first input summary to be merged * \param sb second input summar to be merged @@ -230,7 +232,7 @@ struct WQSummary { if (sb.size == 0) { this->CopyFrom(sa); return; } - utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); + utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); const Entry *a = sa.data, *a_end = sa.data + sa.size; const Entry *b = sb.data, *b_end = sb.data + sb.size; // extended rmin value @@ -297,7 +299,7 @@ struct WXQSummary : public WQSummary { RType begin = src.data[0].rmax; size_t n = maxsize - 1, nbig = 0; RType range = src.data[src.size - 1].rmin - begin; - // prune off zero weights + // prune off zero weights if (range == 0.0f) { // special case, contain only two effective data pts this->data[0] = src.data[0]; @@ -331,7 +333,7 @@ struct WXQSummary : public WQSummary { utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", src.size, maxsize, static_cast(range), - static_cast(chunk)); + static_cast(chunk)); for (size_t i = 0; i < src.size; ++i) { utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, @@ -352,7 +354,7 @@ struct WXQSummary : public WQSummary { RType maxdx2 = src.data[end].rmax_prev() * 2; for (; k < n; ++k) { RType dx2 = 2 * ((k * mrange) / n + begin); - if (dx2 >= maxdx2) break; + if (dx2 >= maxdx2) break; while (i < end && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i; if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) { @@ -371,13 +373,13 @@ struct WXQSummary : public WQSummary { lastidx = end; } bid = end; - // shift base by the gap + // shift base by the gap begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev(); } } } }; -/*! +/*! * \brief traditional GK summary */ template @@ -405,7 +407,7 @@ struct GKSummary { // push data to the queue inline void Push(DType x, RType w) { queue[qtail++] = x; - } + } inline void MakeSummary(GKSummary *out) { std::sort(queue.begin(), queue.begin() + qtail); out->size = qtail; @@ -419,7 +421,7 @@ struct GKSummary { /*! \brief number of elements in the summary */ size_t size; GKSummary(Entry *data, size_t size) - : data(data), size(size) {} + : data(data), size(size) {} /*! \brief the maximum error of the summary */ inline RType MaxError(void) const { RType res = 0; @@ -432,7 +434,7 @@ struct GKSummary { inline RType MaxRank(void) const { return data[size - 1].rmax; } - /*! + /*! 
* \brief copy content from src * \param src source sketch */ @@ -450,8 +452,8 @@ struct GKSummary { << "[" << data[i].rmin << "," << data[i].rmax << "]" << std::endl; } - } - /*! + } + /*! * \brief set current summary to be pruned summary of src * assume data field is already allocated to be at least maxsize * \param src source summary @@ -486,8 +488,8 @@ struct GKSummary { } if (sb.size == 0) { this->CopyFrom(sa); return; - } - utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); + } + utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); const Entry *a = sa.data, *a_end = sa.data + sa.size; const Entry *b = sb.data, *b_end = sb.data + sb.size; this->size = sa.size + sb.size; @@ -500,7 +502,7 @@ struct GKSummary { aprev_rmin = a->rmin; ++dst; ++a; } else { - *dst = Entry(aprev_rmin + b->rmin, + *dst = Entry(aprev_rmin + b->rmin, b->rmax + a->rmax - 1, b->value); bprev_rmin = b->rmin; ++dst; ++b; @@ -537,15 +539,15 @@ class QuantileSketchTemplate { /*! \brief type of summary type */ typedef TSummary Summary; /*! \brief the entry type */ - typedef typename Summary::Entry Entry; + typedef typename Summary::Entry Entry; /*! \brief same as summary, but use STL to backup the space */ struct SummaryContainer : public Summary { std::vector space; - SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) { + SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) { this->space = src.space; this->data = BeginPtr(this->space); } - SummaryContainer(void) : Summary(NULL, 0) { + SummaryContainer(void) : Summary(NULL, 0) { } /*! \brief reserve space for summary */ inline void Reserve(size_t size) { @@ -554,7 +556,7 @@ class QuantileSketchTemplate { this->data = BeginPtr(space); } } - /*! + /*! * \brief set the space to be merge of all Summary arrays * \param begin begining position in th summary array * \param end ending position in the Summary array @@ -597,7 +599,7 @@ class QuantileSketchTemplate { } /*! \brief save the data structure into stream */ template - inline void Save(TStream &fo) const { + inline void Save(TStream &fo) const { // NOLINT(*) fo.Write(&(this->size), sizeof(this->size)); if (this->size != 0) { fo.Write(this->data, this->size * sizeof(Entry)); @@ -605,15 +607,16 @@ class QuantileSketchTemplate { } /*! \brief load data structure from input stream */ template - inline void Load(TStream &fi) { + inline void Load(TStream &fi) { // NOLINT(*) utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1"); this->Reserve(this->size); if (this->size != 0) { - utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2"); + utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0, + "invalid SummaryArray 2"); } } }; - /*! + /*! * \brief intialize the quantile sketch, given the performance specification * \param maxn maximum number of data points can be feed into sketch * \param eps accuracy level of summary @@ -741,8 +744,8 @@ class QuantileSketchTemplate { * \tparam DType type of data content * \tparam RType type of rank */ -template -class WQuantileSketch : +template +class WQuantileSketch : public QuantileSketchTemplate >{ }; @@ -751,8 +754,8 @@ class WQuantileSketch : * \tparam DType type of data content * \tparam RType type of rank */ -template -class WXQuantileSketch : +template +class WXQuantileSketch : public QuantileSketchTemplate >{ }; /*! 
@@ -760,11 +763,11 @@ class WXQuantileSketch : * \tparam DType type of data content * \tparam RType type of rank */ -template -class GKQuantileSketch : +template +class GKQuantileSketch : public QuantileSketchTemplate >{ }; -} // utils -} // xgboost -#endif +} // namespace utils +} // namespace xgboost +#endif // XGBOOST_UTILS_QUANTILE_H_ diff --git a/src/utils/random.h b/src/utils/random.h index 1e3e617f9..7d52c2ae7 100644 --- a/src/utils/random.h +++ b/src/utils/random.h @@ -1,12 +1,14 @@ -#ifndef XGBOOST_UTILS_RANDOM_H_ -#define XGBOOST_UTILS_RANDOM_H_ /*! + * Copyright 2014 by Contributors * \file xgboost_random.h * \brief PRNG to support random number generation * \author Tianqi Chen: tianqi.tchen@gmail.com * * Use standard PRNG from stdlib */ +#ifndef XGBOOST_UTILS_RANDOM_H_ +#define XGBOOST_UTILS_RANDOM_H_ + #include #include #include @@ -23,11 +25,11 @@ inline void Seed(unsigned seed) { } /*! \brief basic function, uniform */ inline double Uniform(void) { - return static_cast(rand()) / (static_cast(RAND_MAX)+1.0); + return static_cast(rand()) / (static_cast(RAND_MAX)+1.0); // NOLINT(*) } /*! \brief return a real numer uniform in (0,1) */ inline double NextDouble2(void) { - return (static_cast(rand()) + 1.0) / (static_cast(RAND_MAX)+2.0); + return (static_cast(rand()) + 1.0) / (static_cast(RAND_MAX)+2.0); // NOLINT(*) } /*! \brief return x~N(0,1) */ inline double Normal(void) { @@ -73,7 +75,7 @@ inline void Shuffle(T *data, size_t sz) { } // random shuffle the data inside, require PRNG template -inline void Shuffle(std::vector &data) { +inline void Shuffle(std::vector &data) { // NOLINT(*) Shuffle(&data[0], data.size()); } @@ -81,17 +83,18 @@ inline void Shuffle(std::vector &data) { struct Random{ /*! \brief set random number seed */ inline void Seed(unsigned sd) { - this->rseed = sd; -#if defined(_MSC_VER)||defined(_WIN32) - ::xgboost::random::Seed(sd); + this->rseed = sd; +#if defined(_MSC_VER) || defined(_WIN32) + ::xgboost::random::Seed(sd); #endif } /*! \brief return a real number uniform in [0,1) */ inline double RandDouble(void) { - // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe - // For cygwin and mingw, this can slows down parallelism, but rand_r is only used in objective-inl.hpp, won't affect speed in general - // todo, replace with another PRNG -#if defined(_MSC_VER)||defined(_WIN32)||defined(XGBOOST_STRICT_CXX98_) + // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe + // For cygwin and mingw, this can slows down parallelism, + // but rand_r is only used in objective-inl.hpp, won't affect speed in general + // todo, replace with another PRNG +#if defined(_MSC_VER) || defined(_WIN32) || defined(XGBOOST_STRICT_CXX98_) return Uniform(); #else return static_cast(rand_r(&rseed)) / (static_cast(RAND_MAX) + 1.0); diff --git a/src/utils/thread.h b/src/utils/thread.h index ef6335a74..a6e8e7fdc 100644 --- a/src/utils/thread.h +++ b/src/utils/thread.h @@ -1,16 +1,17 @@ -#ifndef XGBOOST_UTILS_THREAD_H -#define XGBOOST_UTILS_THREAD_H /*! + * Copyright by Contributors * \file thread.h - * \brief this header include the minimum necessary resource for multi-threading + * \brief this header include the minimum necessary resource + * for multi-threading that can be compiled in windows, linux, mac * \author Tianqi Chen - * Acknowledgement: this file is adapted from SVDFeature project, by same author. 
- * The MAC support part of this code is provided by Artemy Kolchinsky */ +#ifndef XGBOOST_UTILS_THREAD_H_ // NOLINT(*) +#define XGBOOST_UTILS_THREAD_H_ // NOLINT(*) + #ifdef _MSC_VER -#include "utils.h" #include #include +#include "./utils.h" namespace xgboost { namespace utils { /*! \brief simple semaphore used for synchronization */ @@ -18,29 +19,80 @@ class Semaphore { public : inline void Init(int init_val) { sem = CreateSemaphore(NULL, init_val, 10, NULL); - utils::Assert(sem != NULL, "create Semaphore error"); + utils::Check(sem != NULL, "create Semaphore error"); } inline void Destroy(void) { CloseHandle(sem); } inline void Wait(void) { - utils::Assert(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error"); + utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error"); } inline void Post(void) { - utils::Assert(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); + utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); } + private: HANDLE sem; }; + +/*! \brief mutex under windows */ +class Mutex { + public: + inline void Init(void) { + utils::Check(InitializeCriticalSectionAndSpinCount(&mutex, 0x00000400) != 0, + "Mutex::Init fail"); + } + inline void Lock(void) { + EnterCriticalSection(&mutex); + } + inline void Unlock(void) { + LeaveCriticalSection(&mutex); + } + inline void Destroy(void) { + DeleteCriticalSection(&mutex); + } + + private: + friend class ConditionVariable; + CRITICAL_SECTION mutex; +}; + +// condition variable under windows +class ConditionVariable { + public: + // initialize condition variable + inline void Init(void) { + InitializeConditionVariable(&cond); + } + // destroy the condition variable + inline void Destroy(void) { + // DeleteConditionVariable(&cond); + } + // wait on the condition variable + inline void Wait(Mutex *mutex) { + utils::Check(SleepConditionVariableCS(&cond, &(mutex->mutex), INFINITE) != 0, + "ConditionVariable:Wait fail"); + } + inline void Broadcast(void) { + WakeAllConditionVariable(&cond); + } + inline void Signal(void) { + WakeConditionVariable(&cond); + } + + private: + CONDITION_VARIABLE cond; +}; + /*!
\brief simple thread that wraps windows thread */ class Thread { private: HANDLE thread_handle; - unsigned thread_id; + unsigned thread_id; public: - inline void Start(unsigned int __stdcall entry(void*), void *param) { + inline void Start(unsigned int __stdcall entry(void*p), void *param) { thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id); - } + } inline int Join(void) { WaitForSingleObject(thread_handle, INFINITE); return 0; @@ -54,39 +106,41 @@ inline void ThreadExit(void *status) { } // namespace utils } // namespace xgboost #else -// thread interface using g++ -extern "C" { +// thread interface using g++ #include #include -} +#include namespace xgboost { namespace utils { /*!\brief semaphore class */ class Semaphore { #ifdef __APPLE__ + private: sem_t* semPtr; - char sema_name[20]; + char sema_name[20]; + private: inline void GenRandomString(char *s, const int len) { - static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; + static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; for (int i = 0; i < len; ++i) { s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; } s[len] = 0; } + public: inline void Init(int init_val) { - sema_name[0]='/'; - sema_name[1]='s'; - sema_name[2]='e'; - sema_name[3]='/'; + sema_name[0] = '/'; + sema_name[1] = 's'; + sema_name[2] = 'e'; + sema_name[3] = '/'; GenRandomString(&sema_name[4], 16); - if((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) { + if ((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) { perror("sem_open"); exit(1); } - utils::Assert(semPtr != NULL, "create Semaphore error"); + utils::Check(semPtr != NULL, "create Semaphore error"); } inline void Destroy(void) { if (sem_close(semPtr) == -1) { @@ -103,53 +157,93 @@ class Semaphore { } inline void Post(void) { sem_post(semPtr); - } + } #else + private: sem_t sem; + public: inline void Init(int init_val) { - sem_init(&sem, 0, init_val); + if (sem_init(&sem, 0, init_val) != 0) { + utils::Error("Semaphore.Init:%s", strerror(errno)); + } } inline void Destroy(void) { - sem_destroy(&sem); + if (sem_destroy(&sem) != 0) { + utils::Error("Semaphore.Destroy:%s", strerror(errno)); + } } inline void Wait(void) { - sem_wait(&sem); + if (sem_wait(&sem) != 0) { + utils::Error("Semaphore.Wait:%s", strerror(errno)); + } } inline void Post(void) { - sem_post(&sem); + if (sem_post(&sem) != 0) { + utils::Error("Semaphore.Post:%s", strerror(errno)); + } } - #endif + #endif }; -// helper for c thread -// used to strictly call c++ function from pthread -struct ThreadContext { - void *(*entry)(void*); - void *param; -}; -extern "C" { - inline void *RunThreadContext(void *ctx_) { - ThreadContext *ctx = reinterpret_cast(ctx_); - void *ret = (*ctx->entry)(ctx->param); - delete ctx; - return ret; +// mutex that works with pthread +class Mutex { + public: + inline void Init(void) { + pthread_mutex_init(&mutex, NULL); } -} + inline void Lock(void) { + pthread_mutex_lock(&mutex); + } + inline void Unlock(void) { + pthread_mutex_unlock(&mutex); + } + inline void Destroy(void) { + pthread_mutex_destroy(&mutex); + } + + private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; + +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + pthread_cond_init(&cond, NULL); + } + // destroy the thread + inline void Destroy(void) { + pthread_cond_destroy(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex 
*mutex) { + pthread_cond_wait(&cond, &(mutex->mutex)); + } + inline void Broadcast(void) { + pthread_cond_broadcast(&cond); + } + inline void Signal(void) { + pthread_cond_signal(&cond); + } + + private: + pthread_cond_t cond; +}; + /*!\brief simple thread class */ class Thread { private: - pthread_t thread; - + pthread_t thread; public : - inline void Start(void *entry(void*), void *param) { + inline void Start(void * entry(void*), void *param) { // NOLINT(*) pthread_attr_t attr; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - ThreadContext *ctx = new ThreadContext(); - ctx->entry = entry; ctx->param = param; - pthread_create(&thread, &attr, RunThreadContext, ctx); + pthread_create(&thread, &attr, entry, param); } inline int Join(void) { void *status; @@ -159,9 +253,8 @@ class Thread { inline void ThreadExit(void *status) { pthread_exit(status); } - } // namespace utils } // namespace xgboost #define XGBOOST_THREAD_PREFIX void * -#endif -#endif +#endif // Linux +#endif // XGBOOST_UTILS_THREAD_H_ NOLINT(*) diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h index 45da6ec84..2119f53ab 100644 --- a/src/utils/thread_buffer.h +++ b/src/utils/thread_buffer.h @@ -1,10 +1,12 @@ -#ifndef XGBOOST_UTILS_THREAD_BUFFER_H_ -#define XGBOOST_UTILS_THREAD_BUFFER_H_ /*! + * Copyright 2014 by Contributors * \file thread_buffer.h * \brief multi-thread buffer, iterator, can be used to create parallel pipeline * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_THREAD_BUFFER_H_ +#define XGBOOST_UTILS_THREAD_BUFFER_H_ + #include #include #include @@ -27,7 +29,7 @@ class ThreadBuffer { this->buf_size = 30; } ~ThreadBuffer(void) { - if(init_end) this->Destroy(); + if (init_end) this->Destroy(); } /*!\brief set parameter, will also pass the parameter to factory */ inline void SetParam(const char *name, const char *val) { @@ -38,7 +40,7 @@ class ThreadBuffer { /*! * \brief initalize the buffered iterator * \param param a initialize parameter that will pass to factory, ignore it if not necessary - * \return false if the initlization can't be done, e.g. buffer file hasn't been created + * \return false if the initlization can't be done, e.g. buffer file hasn't been created */ inline bool Init(void) { if (!factory.Init()) return false; @@ -49,7 +51,7 @@ class ThreadBuffer { this->init_end = true; this->StartLoader(); return true; - } + } /*!\brief place the iterator before first value */ inline void BeforeFirst(void) { // wait till last loader end @@ -70,7 +72,7 @@ class ThreadBuffer { loading_need.Post(); // set buffer value buf_index = 0; - } + } /*! \brief destroy the buffer iterator, will deallocate the buffer */ inline void Destroy(void) { // wait until the signal is consumed @@ -78,7 +80,7 @@ class ThreadBuffer { loading_need.Post(); loader_thread.Join(); loading_need.Destroy(); - loading_end.Destroy(); + loading_end.Destroy(); for (size_t i = 0; i < bufA.size(); ++i) { factory.FreeSpace(bufA[i]); } @@ -88,37 +90,38 @@ class ThreadBuffer { bufA.clear(); bufB.clear(); factory.Destroy(); this->init_end = false; - } + } /*! * \brief get the next element needed in buffer * \param elem element to store into * \return whether reaches end of data */ - inline bool Next(Elem &elem) { + inline bool Next(Elem &elem) { // NOLINT(*) // end of buffer try to switch if (buf_index == buf_size) { this->SwitchBuffer(); buf_index = 0; } - if (buf_index >= (current_buf ? endA : endB)) { + if (buf_index >= (current_buf ? 
endA : endB)) { return false; } std::vector &buf = current_buf ? bufA : bufB; elem = buf[buf_index]; ++buf_index; return true; - } + } /*! * \brief get the factory object */ inline ElemFactory &get_factory(void) { return factory; } - inline const ElemFactory &get_factory(void) const{ + inline const ElemFactory &get_factory(void) const { return factory; } // size of buffer int buf_size; + private: // factory object used to load configures ElemFactory factory; @@ -147,15 +150,15 @@ class ThreadBuffer { * this implementation is like producer-consumer style */ inline void RunLoader(void) { - while(!destroy_signal) { + while (!destroy_signal) { // sleep until loading is needed - loading_need.Wait(); + loading_need.Wait(); std::vector &buf = current_buf ? bufB : bufA; int i; for (i = 0; i < buf_size ; ++i) { if (!factory.LoadNext(buf[i])) { int &end = current_buf ? endB : endA; - end = i; // marks the termination + end = i; // marks the termination break; } } @@ -166,14 +169,14 @@ class ThreadBuffer { } /*!\brief entry point of loader thread */ inline static XGBOOST_THREAD_PREFIX LoaderEntry(void *pthread) { - static_cast< ThreadBuffer* >(pthread)->RunLoader(); + static_cast< ThreadBuffer* >(pthread)->RunLoader(); return NULL; } /*!\brief start loader thread */ inline void StartLoader(void) { destroy_signal = false; // set param - current_buf = 1; + current_buf = 1; loading_need.Init(1); loading_end .Init(0); // reset terminate limit @@ -185,8 +188,8 @@ class ThreadBuffer { current_buf = 0; // wake loader for next part data_loaded = false; - loading_need.Post(); - buf_index = 0; + loading_need.Post(); + buf_index = 0; } /*!\brief switch double buffer */ inline void SwitchBuffer(void) { @@ -198,7 +201,6 @@ class ThreadBuffer { loading_need.Post(); } }; - } // namespace utils } // namespace xgboost -#endif +#endif // XGBOOST_UTILS_THREAD_BUFFER_H_ diff --git a/src/utils/utils.h b/src/utils/utils.h index e6026c3a6..7a8f18390 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -1,15 +1,18 @@ -#ifndef XGBOOST_UTILS_UTILS_H_ -#define XGBOOST_UTILS_UTILS_H_ /*! + * Copyright 2014 by Contributors * \file utils.h * \brief simple utils to support the code * \author Tianqi Chen */ +#ifndef XGBOOST_UTILS_UTILS_H_ +#define XGBOOST_UTILS_UTILS_H_ + #define _CRT_SECURE_NO_WARNINGS #include #include #include #include +#include #ifndef XGBOOST_STRICT_CXX98_ #include @@ -19,18 +22,18 @@ #define fopen64 std::fopen #endif #ifdef _MSC_VER -// NOTE: sprintf_s is not equivalent to snprintf, +// NOTE: sprintf_s is not equivalent to snprintf, // they are equivalent when success, which is sufficient for our case #define snprintf sprintf_s #define vsnprintf vsprintf_s #else #ifdef _FILE_OFFSET_BITS #if _FILE_OFFSET_BITS == 32 -#pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit") +#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit") #endif #endif -#ifdef __APPLE__ +#ifdef __APPLE__ #define off64_t off_t #define fopen64 std::fopen #endif @@ -58,21 +61,20 @@ namespace utils { const int kPrintBuffer = 1 << 12; #ifndef XGBOOST_CUSTOMIZE_MSG_ -/*! +/*! * \brief handling of Assert error, caused by in-apropriate input - * \param msg error message + * \param msg error message */ inline void HandleAssertError(const char *msg) { fprintf(stderr, "AssertError:%s\n", msg); exit(-1); } -/*! +/*! 
* \brief handling of Check error, caused by in-apropriate input - * \param msg error message + * \param msg error message */ inline void HandleCheckError(const char *msg) { - fprintf(stderr, "%s\n", msg); - exit(-1); + throw std::runtime_error(msg); } inline void HandlePrint(const char *msg) { printf("%s", msg); @@ -158,7 +160,7 @@ inline std::FILE *FopenCheck(const char *fname, const char *flag) { // easy utils that can be directly acessed in xgboost /*! \brief get the beginning address of a vector */ template -inline T *BeginPtr(std::vector &vec) { +inline T *BeginPtr(std::vector &vec) { // NOLINT(*) if (vec.size() == 0) { return NULL; } else { @@ -174,7 +176,7 @@ inline const T *BeginPtr(const std::vector &vec) { return &vec[0]; } } -inline char* BeginPtr(std::string &str) { +inline char* BeginPtr(std::string &str) { // NOLINT(*) if (str.length() == 0) return NULL; return &str[0]; } diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp index 769e3be3b..773001503 100644 --- a/src/xgboost_main.cpp +++ b/src/xgboost_main.cpp @@ -1,18 +1,20 @@ +// Copyright 2014 by Contributors #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #define NOMINMAX #include #include #include +#include #include "./sync/sync.h" -#include "io/io.h" -#include "utils/utils.h" -#include "utils/config.h" -#include "learner/learner-inl.hpp" +#include "./io/io.h" +#include "./utils/utils.h" +#include "./utils/config.h" +#include "./learner/learner-inl.hpp" namespace xgboost { /*! - * \brief wrapping the training process + * \brief wrapping the training process */ class BoostLearnTask { public: @@ -20,7 +22,7 @@ class BoostLearnTask { if (argc < 2) { printf("Usage: \n"); return 0; - } + } utils::ConfigIterator itr(argv[1]); while (itr.Next()) { this->SetParam(itr.name(), itr.val()); @@ -44,10 +46,10 @@ class BoostLearnTask { } if (rabit::IsDistributed() && data_split == "NONE") { this->SetParam("dsplit", "row"); - } + } if (rabit::GetRank() != 0) { this->SetParam("silent", "2"); - } + } this->InitData(); if (task == "train") { @@ -90,12 +92,14 @@ class BoostLearnTask { if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val); if (!strncmp("eval[", name, 5)) { char evname[256]; - utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); + utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, + "must specify evaluation name for display"); eval_data_names.push_back(std::string(evname)); eval_data_paths.push_back(std::string(val)); } learner.SetParam(name, val); } + public: BoostLearnTask(void) { // default parameters @@ -119,12 +123,13 @@ class BoostLearnTask { save_with_pbuffer = 0; data = NULL; } - ~BoostLearnTask(void){ - for (size_t i = 0; i < deval.size(); i++){ + ~BoostLearnTask(void) { + for (size_t i = 0; i < deval.size(); i++) { delete deval[i]; } if (data != NULL) delete data; } + private: inline void InitData(void) { if (strchr(train_path.c_str(), '%') != NULL) { @@ -151,14 +156,14 @@ class BoostLearnTask { loadsplit)); devalall.push_back(deval.back()); } - + std::vector dcache(1, data); - for (size_t i = 0; i < deval.size(); ++ i) { + for (size_t i = 0; i < deval.size(); ++i) { dcache.push_back(deval[i]); } // set cache data to be all training and evaluation data learner.SetCacheData(dcache); - + // add training set to evaluation set if needed if (eval_train != 0) { devalall.push_back(data); @@ -178,13 +183,13 @@ class BoostLearnTask { int version = rabit::LoadCheckPoint(&learner); if (version == 0) this->InitLearner(); const time_t start = 
time(NULL); - unsigned long elapsed = 0; + unsigned long elapsed = 0; // NOLINT(*) learner.CheckInit(data); bool allow_lazy = learner.AllowLazyCheckPoint(); for (int i = version / 2; i < num_round; ++i) { - elapsed = (unsigned long)(time(NULL) - start); - if (version % 2 == 0) { + elapsed = (unsigned long)(time(NULL) - start); // NOLINT(*) + if (version % 2 == 0) { if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); learner.UpdateOneIter(i, *data); if (allow_lazy) { @@ -196,7 +201,7 @@ class BoostLearnTask { } utils::Assert(version == rabit::VersionNumber(), "consistent check"); std::string res = learner.EvalOneIter(i, devalall, eval_data_names); - if (rabit::IsDistributed()){ + if (rabit::IsDistributed()) { if (rabit::GetRank() == 0) { rabit::TrackerPrintf("%s\n", res.c_str()); } @@ -215,29 +220,29 @@ class BoostLearnTask { } version += 1; utils::Assert(version == rabit::VersionNumber(), "consistent check"); - elapsed = (unsigned long)(time(NULL) - start); + elapsed = (unsigned long)(time(NULL) - start); // NOLINT(*) } // always save final round if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE") { - if (model_out == "NULL"){ + if (model_out == "NULL") { this->SaveModel(num_round - 1); } else { this->SaveModel(model_out.c_str()); } } - if (!silent){ + if (!silent) { printf("\nupdating end, %lu sec in all\n", elapsed); } } inline void TaskEval(void) { learner.EvalOneIter(0, devalall, eval_data_names); } - inline void TaskDump(void){ + inline void TaskDump(void) { FILE *fo = utils::FopenCheck(name_dump.c_str(), "w"); std::vector dump = learner.DumpModel(fmap, dump_model_stats != 0); - for (size_t i = 0; i < dump.size(); ++ i) { - fprintf(fo,"booster[%lu]:\n", i); - fprintf(fo,"%s", dump[i].c_str()); + for (size_t i = 0; i < dump.size(); ++i) { + fprintf(fo, "booster[%lu]:\n", i); + fprintf(fo, "%s", dump[i].c_str()); } fclose(fo); } @@ -247,14 +252,15 @@ class BoostLearnTask { } inline void SaveModel(int i) const { char fname[256]; - sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1); + utils::SPrintf(fname, sizeof(fname), + "%s/%04d.model", model_dir_path.c_str(), i + 1); this->SaveModel(fname); } inline void TaskPred(void) { std::vector preds; if (!silent) printf("start prediction...\n"); learner.Predict(*data, pred_margin != 0, &preds, ntree_limit); - if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); + if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); FILE *fo; if (name_pred != "stdout") { fo = utils::FopenCheck(name_pred.c_str(), "w"); @@ -266,6 +272,7 @@ class BoostLearnTask { } if (fo != stdout) fclose(fo); } + private: /*! \brief whether silent */ int silent; @@ -273,7 +280,7 @@ class BoostLearnTask { int load_part; /*! \brief whether use auto binary buffer */ int use_buffer; - /*! \brief whether evaluate training statistics */ + /*! \brief whether evaluate training statistics */ int eval_train; /*! \brief number of boosting iterations */ int num_round; @@ -309,6 +316,7 @@ class BoostLearnTask { std::vector eval_data_paths; /*! 
\brief the names of the evaluation data used in output log */ std::vector eval_data_names; + private: io::DataMatrix* data; std::vector deval; @@ -316,9 +324,9 @@ class BoostLearnTask { utils::FeatMap fmap; learner::BoostLearner learner; }; -} +} // namespace xgboost -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) { xgboost::BoostLearnTask tsk; tsk.SetParam("seed", "0"); int ret = tsk.Run(argc, argv); diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 000000000..19e34d5df --- /dev/null +++ b/tests/README.md @@ -0,0 +1 @@ +This folder contains test cases for xgboost. \ No newline at end of file diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py new file mode 100644 index 000000000..77d19595b --- /dev/null +++ b/tests/python/test_basic.py @@ -0,0 +1,31 @@ +import numpy as np +import xgboost as xgb + +dpath = 'demo/data/' + +def test_basic(): + dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') + dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } + # specify validation set to watch performance + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 2 + bst = xgb.train(param, dtrain, num_round, watchlist) + # this is prediction + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + # error must be smaller than 10% + assert err < 0.1 + + # save dmatrix into binary buffer + dtest.save_binary('dtest.buffer') + # save model + bst.save_model('xgb.model') + # load model and data in + bst2 = xgb.Booster(model_file='xgb.model') + dtest2 = xgb.DMatrix('dtest.buffer') + preds2 = bst2.predict(dtest2) + # assert they are the same + assert np.sum(np.abs(preds2-preds)) == 0 +
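Note on test_basic.py: the generator-based error rate above can also be written with vectorized numpy operations; a minimal sketch, assuming preds and labels are the arrays produced by bst.predict(dtest) and dtest.get_label():

    import numpy as np
    # fraction of test rows whose thresholded prediction disagrees with the label
    err = float(np.sum((preds > 0.5).astype(int) != labels)) / len(preds)
    assert err < 0.1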
diff --git a/windows/xgboost.sln b/windows/xgboost.sln index f2b08a456..7bd8db5b2 100644 --- a/windows/xgboost.sln +++ b/windows/xgboost.sln @@ -10,6 +10,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit", "..\subtree\rabit\windows\rabit\rabit.vcxproj", "{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboostjavawrapper", "xgboostjavawrapper\xgboostjavawrapper.vcxproj", "{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -41,6 +43,14 @@ Global {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|Win32.Build.0 = Release|Win32 {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|x64.ActiveCfg = Release|x64 {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|x64.Build.0 = Release|x64 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.ActiveCfg = Debug|Win32 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.Build.0 = Debug|Win32 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.ActiveCfg = Debug|x64 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.Build.0 = Debug|x64 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.ActiveCfg = Release|Win32 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.Build.0 = Release|Win32 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|x64.ActiveCfg = Release|x64 + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/windows/xgboostjavawrapper/xgboostjavawrapper.vcxproj b/windows/xgboostjavawrapper/xgboostjavawrapper.vcxproj new file mode 100644 index 000000000..e55dfff71 --- /dev/null +++ b/windows/xgboostjavawrapper/xgboostjavawrapper.vcxproj @@ -0,0 +1,129 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA} + xgboost_wrapper + + + + DynamicLibrary + true + MultiByte + + + DynamicLibrary + true + MultiByte + + + DynamicLibrary + false + true + MultiByte + + + DynamicLibrary + false + true + MultiByte + + + + + + + + + + + + + + + + + + + $(SolutionDir)$(Platform)\$(Configuration)\ + + + + Level3 + Disabled + + + true + + + + + Level3 + Disabled + + + true + + + + + Level3 + MaxSpeed + true + true + true + $(JAVA_HOME)\include;$(JAVA_HOME)\include\win32;%(AdditionalIncludeDirectories) + + + true + true + true + + + + + Level3 + MaxSpeed + true + true + true + MultiThreaded + $(JAVA_HOME)\include\win32;$(JAVA_HOME)\include;%(AdditionalIncludeDirectories) + + + true + true + true + ws2_32.lib;%(AdditionalDependencies) + + + + + + \ No newline at end of file diff --git a/wrapper/setup.py b/wrapper/setup.py index 52bf1cf82..5365d61b0 100644 --- a/wrapper/setup.py +++ b/wrapper/setup.py @@ -1,9 +1,12 @@ +# pylint: disable=invalid-name +"""Setup xgboost package.""" import os import platform from setuptools import setup class XGBoostLibraryNotFound(Exception): + """Exception to raise when xgboost library cannot be found.""" pass @@ -15,7 +18,7 @@ if os.name == 'nt': dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/')) else: dll_path.append(os.path.join(curr_dir, '../windows/Release/')) - + if os.name == 'nt': dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 0280d87b3..7a601424c 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -6,7 +6,7 @@ Version: 0.40 Authors: Tianqi Chen, Bing Xu Early stopping by Zygmunt Zając """ - +# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name from __future__ import absolute_import import os @@ -28,20 +28,24 @@ except ImportError: SKLEARN_INSTALLED = False class XGBoostLibraryNotFound(Exception): + """Error thrown when the xgboost library is not found""" pass class XGBoostError(Exception): + """Error thrown by the xgboost trainer.""" pass __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: - string_types = str, + # pylint: disable=invalid-name + STRING_TYPES = str, else: - string_types = basestring, - + # pylint: disable=invalid-name + STRING_TYPES = basestring, def load_xglib(): + """Load the xgboost library.""" curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) dll_path = [curr_path] if os.name == 'nt': @@ -55,36 +59,33 @@ def load_xglib(): dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] if len(dll_path) == 0: - raise XGBoostLibraryNotFound('cannot find find the files in the candicate path ' + str(dll_path)) + raise XGBoostLibraryNotFound( + 'cannot find the files in the candidate path ' + str(dll_path)) lib = ctypes.cdll.LoadLibrary(lib_path[0]) - - # DMatrix functions - lib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p - lib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p - lib.XGDMatrixCreateFromCSC.restype = ctypes.c_void_p - lib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p - lib.XGDMatrixSliceDMatrix.restype =
ctypes.c_void_p - lib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float) - lib.XGDMatrixGetUIntInfo.restype = ctypes.POINTER(ctypes.c_uint) - lib.XGDMatrixNumRow.restype = ctypes.c_ulong - - # Booster functions - lib.XGBoosterCreate.restype = ctypes.c_void_p - lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) - lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p - lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) - lib.XGBoosterGetModelRaw.restype = ctypes.POINTER(ctypes.c_char) - lib.XGBoosterLoadModelFromBuffer.restype = ctypes.c_void_p + lib.XGBGetLastError.restype = ctypes.c_char_p return lib # load the XGBoost library globally -xglib = load_xglib() +_LIB = load_xglib() + +def _check_call(ret): + """Check the return value of C API call + + This function will raise exception when error occurs. + Wrap every API call with this function + + Parameters + ---------- + ret : int + return value from API calls + """ + if ret != 0: + raise XGBoostError(_LIB.XGBGetLastError()) def ctypes2numpy(cptr, length, dtype): - """ - Convert a ctypes pointer array to a numpy array. + """Convert a ctypes pointer array to a numpy array. """ if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)): raise RuntimeError('expected float pointer') @@ -95,6 +96,7 @@ def ctypes2numpy(cptr, length, dtype): def ctypes2buffer(cptr, length): + """Convert ctypes pointer to buffer type.""" if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): raise RuntimeError('expected char pointer') res = bytearray(length) @@ -105,14 +107,17 @@ def ctypes2buffer(cptr, length): def c_str(string): + """Convert a python string to cstring.""" return ctypes.c_char_p(string.encode('utf-8')) def c_array(ctype, values): + """Convert a python string to c array.""" return (ctype * len(values))(*values) class DMatrix(object): + """Data Matrix used in XGBoost.""" def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. 
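Note on the new error handling: the per-function restype declarations are gone because every C API entry point now returns an int status, and _check_call turns a nonzero status into an XGBoostError carrying the XGBGetLastError message. A minimal sketch of the pattern (the file name is a hypothetical placeholder, assuming the shared library is built):

    import ctypes
    handle = ctypes.c_void_p()
    try:
        # a bad path makes the C side return a nonzero status code
        _check_call(_LIB.XGDMatrixCreateFromFile(c_str('no_such_file.txt'), 1,
                                                 ctypes.byref(handle)))
    except XGBoostError as err:
        print('backend failure surfaced as:', err)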
@@ -135,8 +140,11 @@ class DMatrix(object): if data is None: self.handle = None return - if isinstance(data, string_types): - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent))) + if isinstance(data, STRING_TYPES): + self.handle = ctypes.c_void_p() + _check_call(_LIB.XGDMatrixCreateFromFile(c_str(data), + int(silent), + ctypes.byref(self.handle))) elif isinstance(data, scipy.sparse.csr_matrix): self._init_from_csr(data) elif isinstance(data, scipy.sparse.csc_matrix): @@ -160,11 +168,12 @@ class DMatrix(object): """ if len(csr.indices) != len(csr.data): raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR( - c_array(ctypes.c_ulong, csr.indptr), - c_array(ctypes.c_uint, csr.indices), - c_array(ctypes.c_float, csr.data), - len(csr.indptr), len(csr.data))) + self.handle = ctypes.c_void_p() + _check_call(_LIB.XGDMatrixCreateFromCSR(c_array(ctypes.c_ulong, csr.indptr), + c_array(ctypes.c_uint, csr.indices), + c_array(ctypes.c_float, csr.data), + len(csr.indptr), len(csr.data), + ctypes.byref(self.handle))) def _init_from_csc(self, csc): """ @@ -172,45 +181,103 @@ class DMatrix(object): """ if len(csc.indices) != len(csc.data): raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data))) - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC( - c_array(ctypes.c_ulong, csc.indptr), - c_array(ctypes.c_uint, csc.indices), - c_array(ctypes.c_float, csc.data), - len(csc.indptr), len(csc.data))) + self.handle = ctypes.c_void_p() + _check_call(_LIB.XGDMatrixCreateFromCSC(c_array(ctypes.c_ulong, csc.indptr), + c_array(ctypes.c_uint, csc.indices), + c_array(ctypes.c_float, csc.data), + len(csc.indptr), len(csc.data), + ctypes.byref(self.handle))) def _init_from_npy2d(self, mat, missing): """ Initialize data from a 2-D numpy matrix. """ data = np.array(mat.reshape(mat.size), dtype=np.float32) - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat( - data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), - mat.shape[0], mat.shape[1], ctypes.c_float(missing))) + self.handle = ctypes.c_void_p() + _check_call(_LIB.XGDMatrixCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + mat.shape[0], mat.shape[1], + ctypes.c_float(missing), + ctypes.byref(self.handle))) def __del__(self): - xglib.XGDMatrixFree(self.handle) + _check_call(_LIB.XGDMatrixFree(self.handle)) def get_float_info(self, field): + """Get float property from the DMatrix. + + Parameters + ---------- + field: str + The field name of the information + + Returns + ------- + info : array + a numpy array of float information of the data + """ length = ctypes.c_ulong() - ret = xglib.XGDMatrixGetFloatInfo(self.handle, c_str(field), ctypes.byref(length)) + ret = ctypes.POINTER(ctypes.c_float)() + _check_call(_LIB.XGDMatrixGetFloatInfo(self.handle, + c_str(field), + ctypes.byref(length), + ctypes.byref(ret))) return ctypes2numpy(ret, length.value, np.float32) def get_uint_info(self, field): + """Get unsigned integer property from the DMatrix. 
+ + Parameters + ---------- + field: str + The field name of the information + + Returns + ------- + info : array + a numpy array of unsigned integer information of the data + """ length = ctypes.c_ulong() - ret = xglib.XGDMatrixGetUIntInfo(self.handle, c_str(field), ctypes.byref(length)) + ret = ctypes.POINTER(ctypes.c_uint)() + _check_call(_LIB.XGDMatrixGetUIntInfo(self.handle, + c_str(field), + ctypes.byref(length), + ctypes.byref(ret))) return ctypes2numpy(ret, length.value, np.uint32) def set_float_info(self, field, data): - xglib.XGDMatrixSetFloatInfo(self.handle, c_str(field), - c_array(ctypes.c_float, data), len(data)) + """Set float type property into the DMatrix. + + Parameters + ---------- + field: str + The field name of the information + + data: numpy array + The array of data to be set + """ + _check_call(_LIB.XGDMatrixSetFloatInfo(self.handle, + c_str(field), + c_array(ctypes.c_float, data), + len(data))) def set_uint_info(self, field, data): - xglib.XGDMatrixSetUIntInfo(self.handle, c_str(field), - c_array(ctypes.c_uint, data), len(data)) + """Set uint type property into the DMatrix. + + Parameters + ---------- + field: str + The field name of the information + + data: numpy array + The array of data to be set + """ + _check_call(_LIB.XGDMatrixSetUIntInfo(self.handle, + c_str(field), + c_array(ctypes.c_uint, data), + len(data))) def save_binary(self, fname, silent=True): - """ - Save DMatrix to an XGBoost buffer. + """Save DMatrix to an XGBoost buffer. Parameters ---------- @@ -219,74 +286,78 @@ class DMatrix(object): silent : bool (optional; default: True) If set, the output is suppressed. """ - xglib.XGDMatrixSaveBinary(self.handle, c_str(fname), int(silent)) + _check_call(_LIB.XGDMatrixSaveBinary(self.handle, + c_str(fname), + int(silent))) def set_label(self, label): - """set label of dmatrix - Args: - label: list - label for DMatrix - Returns: - None + """Set label of dmatrix + + Parameters + ---------- + label: array like + The label information to be set into DMatrix """ self.set_float_info('label', label) def set_weight(self, weight): - """ - Set weight of each instance. + """ Set weight of each instance. Parameters ---------- - weight : float - Weight for positive instance. + weight : array like + Weight for each data point """ self.set_float_info('weight', weight) def set_base_margin(self, margin): - """ - set base margin of booster to start from - this can be used to specify a prediction value of + """ Set base margin of booster to start from. + + This can be used to specify a prediction value of existing model to be base_margin However, remember margin is needed, instead of transformed prediction e.g. for logistic regression: need to put in value before logistic transformation see also example/demo.py + + Parameters + ---------- + margin: array like + Prediction margin of each datapoint """ self.set_float_info('base_margin', margin) def set_group(self, group): - """ - Set group size of DMatrix (used for ranking). + """Set group size of DMatrix (used for ranking). Parameters ---------- - group : int - Group size. + group : array like + Group size of each group """ - xglib.XGDMatrixSetGroup(self.handle, c_array(ctypes.c_uint, group), len(group)) + _check_call(_LIB.XGDMatrixSetGroup(self.handle, + c_array(ctypes.c_uint, group), + len(group)))
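Note: set_group now documents per-group sizes rather than a single int, which matches the ranking use case. A tiny usage sketch (toy numbers; the sizes must sum to the number of rows in the DMatrix, 7 in this hypothetical example):

    dtrain.set_group([3, 2, 2])    # three query groups of sizes 3, 2 and 2
    dtrain.set_weight([1.0] * 7)   # optional per-instance weights, one per row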
def get_label(self): - """ - Get the label of the DMatrix. + """Get the label of the DMatrix. Returns ------- - label : list + label : array """ return self.get_float_info('label') def get_weight(self): - """ - Get the weight of the DMatrix. + """Get the weight of the DMatrix. Returns ------- - weight : float + weight : array """ return self.get_float_info('weight') def get_base_margin(self): - """ - Get the base margin of the DMatrix. + """Get the base margin of the DMatrix. Returns ------- @@ -295,18 +366,19 @@ class DMatrix(object): return self.get_float_info('base_margin') def num_row(self): - """ - Get the number of rows in the DMatrix. + """Get the number of rows in the DMatrix. Returns ------- number of rows : int """ - return xglib.XGDMatrixNumRow(self.handle) + ret = ctypes.c_ulong() + _check_call(_LIB.XGDMatrixNumRow(self.handle, + ctypes.byref(ret))) + return ret.value def slice(self, rindex): - """ - Slice the DMatrix and return a new DMatrix that only contains `rindex`. + """Slice the DMatrix and return a new DMatrix that only contains `rindex`. Parameters ---------- @@ -319,13 +391,18 @@ rindex : list List of indices to be selected. Returns ------- res : DMatrix A new DMatrix containing only selected indices. """ res = DMatrix(None) - res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix( - self.handle, c_array(ctypes.c_int, rindex), len(rindex))) + res.handle = ctypes.c_void_p() + _check_call(_LIB.XGDMatrixSliceDMatrix(self.handle, + c_array(ctypes.c_int, rindex), + len(rindex), + ctypes.byref(res.handle))) return res class Booster(object): + """A Booster of XGBoost.""" def __init__(self, params=None, cache=(), model_file=None): + # pylint: disable=invalid-name """ Learner class. @@ -342,14 +419,15 @@ if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__)) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) - self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache))) + self.handle = ctypes.c_void_p() + _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle))) self.set_param({'seed': 0}) self.set_param(params or {}) if model_file is not None: self.load_model(model_file) def __del__(self): - xglib.XGBoosterFree(self.handle) + _LIB.XGBoosterFree(self.handle) def __getstate__(self): # can't pickle ctypes pointers @@ -367,10 +445,11 @@ if handle is not None: buf = handle dmats = c_array(ctypes.c_void_p, []) - handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0)) + handle = ctypes.c_void_p() + _check_call(_LIB.XGBoosterCreate(dmats, 0, ctypes.byref(handle))) length = ctypes.c_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) - xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length) + _check_call(_LIB.XGBoosterLoadModelFromBuffer(handle, ptr, length)) state['handle'] = handle self.__dict__.update(state) self.set_param({'seed': 0}) @@ -379,11 +458,10 @@ return self.__deepcopy__() def __deepcopy__(self): - return Booster(model_file = self.save_raw()) + return Booster(model_file=self.save_raw())
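Note: because __getstate__/__setstate__ round-trip through the raw model buffer, a Booster now survives pickling; a quick sketch, assuming bst is a trained Booster:

    import pickle
    buf = pickle.dumps(bst)    # serialized via save_raw() under the hood
    bst2 = pickle.loads(buf)   # restored through XGBoosterLoadModelFromBuffer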
def copy(self): - """ - Copy the booster object + """Copy the booster object. Returns -------- booster: a copied booster model """ return self.__copy__() - def set_param(self, params, pv=None): + def set_param(self, params, value=None): + """Set parameters into the Booster.""" if isinstance(params, collections.Mapping): params = params.items() - elif isinstance(params, string_types) and pv is not None: - params = [(params, pv)] - for k, v in params: - xglib.XGBoosterSetParam(self.handle, c_str(k), c_str(str(v))) + elif isinstance(params, STRING_TYPES) and value is not None: + params = [(params, value)] + for key, val in params: + _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val)))) - def update(self, dtrain, it, fobj=None): + def update(self, dtrain, iteration, fobj=None): """ Update (one iteration). @@ -407,7 +486,7 @@ ---------- dtrain : DMatrix Training data. - it : int + iteration : int Current iteration number. fobj : function Customized objective function. @@ -415,7 +494,7 @@ if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) if fobj is None: - xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) + _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle)) else: pred = self.predict(dtrain) grad, hess = fobj(pred, dtrain) @@ -438,20 +517,20 @@ raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) - xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle, - c_array(ctypes.c_float, grad), - c_array(ctypes.c_float, hess), - len(grad)) + _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, + c_array(ctypes.c_float, grad), + c_array(ctypes.c_float, hess), + len(grad))) - def eval_set(self, evals, it=0, feval=None): - """ - Evaluate by a metric. + def eval_set(self, evals, iteration=0, feval=None): + # pylint: disable=invalid-name + """Evaluate a set of data. Parameters ---------- evals : list of tuples (DMatrix, string) List of items to be evaluated. - it : int + iteration : int Current iteration. feval : function Custom evaluation function. @@ -464,20 +543,39 @@ for d in evals: if not isinstance(d[0], DMatrix): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) - if not isinstance(d[1], string_types): + if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) - return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) + msg = ctypes.c_char_p() + _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration, + dmats, evnames, len(evals), + ctypes.byref(msg))) + return msg.value else: - res = '[%d]' % it - for dm, evname in evals: - name, val = feval(self.predict(dm), dm) + res = '[%d]' % iteration + for dmat, evname in evals: + name, val = feval(self.predict(dmat), dmat) res += '\t%s-%s:%f' % (evname, name, val) return res
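Note: update() keeps its custom-objective path: when fobj is given it predicts, evaluates fobj, and feeds the result to boost(). A sketch with the usual logistic-loss gradients (not part of this patch; bst and dtrain are assumed to exist):

    import numpy as np

    def logregobj(preds, dtrain):
        labels = dtrain.get_label()
        prob = 1.0 / (1.0 + np.exp(-preds))         # sigmoid of the current margin
        return prob - labels, prob * (1.0 - prob)   # (grad, hess)

    bst.update(dtrain, 0, fobj=logregobj)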
- def eval_set(self, evals, it=0, feval=None): - """ - Evaluate by a metric. + def eval_set(self, evals, iteration=0, feval=None): + # pylint: disable=invalid-name + """Evaluate a set of data. Parameters ---------- evals : list of tuples (DMatrix, string) List of items to be evaluated. - it : int + iteration : int Current iteration. feval : function Custom evaluation function. @@ -464,20 +543,39 @@ class Booster(object): for d in evals: if not isinstance(d[0], DMatrix): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) - if not isinstance(d[1], string_types): + if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) - return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) + msg = ctypes.c_char_p() + _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration, + dmats, evnames, len(evals), + ctypes.byref(msg))) + return msg.value else: - res = '[%d]' % it - for dm, evname in evals: - name, val = feval(self.predict(dm), dm) + res = '[%d]' % iteration + for dmat, evname in evals: + name, val = feval(self.predict(dmat), dmat) res += '\t%s-%s:%f' % (evname, name, val) return res - def eval(self, mat, name='eval', it=0): - return self.eval_set([(mat, name)], it) + def eval(self, data, name='eval', iteration=0): + """Evaluate the model on data. + + Parameters + ---------- + data : DMatrix + The dmatrix storing the input. + + name : str (default = 'eval') + The name of the dataset. + + iteration : int (default = 0) + The current iteration number. + """ + return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): """ Parameters ---------- data : DMatrix The dmatrix storing the input. + output_margin : bool Whether to output the raw untransformed margin value. + ntree_limit : int Limit number of trees in the prediction; defaults to 0 (use all trees). + pred_leaf : bool When this option is on, the output will be a matrix of (nsample, ntrees) with each record indicating the predicted leaf index of each sample in each tree. @@ -512,8 +613,11 @@ class Booster(object): if pred_leaf: option_mask |= 0x02 length = ctypes.c_ulong() - preds = xglib.XGBoosterPredict(self.handle, data.handle, - option_mask, ntree_limit, ctypes.byref(length)) + preds = ctypes.POINTER(ctypes.c_float)() + _check_call(_LIB.XGBoosterPredict(self.handle, data.handle, + option_mask, ntree_limit, + ctypes.byref(length), + ctypes.byref(preds))) preds = ctypes2numpy(preds, length.value, np.float32) if pred_leaf: preds = preds.astype(np.int32) @@ -531,8 +635,8 @@ class Booster(object): fname : string Output file name """ - if isinstance(fname, string_types): # assume file name - xglib.XGBoosterSaveModel(self.handle, c_str(fname)) + if isinstance(fname, STRING_TYPES): # assume file name + _check_call(_LIB.XGBoosterSaveModel(self.handle, c_str(fname))) else: raise TypeError("fname must be a string") @@ -545,8 +649,10 @@ class Booster(object): an in-memory buffer representation of the model """ length = ctypes.c_ulong() - cptr = xglib.XGBoosterGetModelRaw(self.handle, - ctypes.byref(length)) + cptr = ctypes.POINTER(ctypes.c_char)() + _check_call(_LIB.XGBoosterGetModelRaw(self.handle, + ctypes.byref(length), + ctypes.byref(cptr))) return ctypes2buffer(cptr, length.value) def load_model(self, fname): @@ -559,59 +665,67 @@ class Booster(object): Input file name or memory buffer (see also save_raw) """ if isinstance(fname, str): # assume file name - xglib.XGBoosterLoadModel(self.handle, c_str(fname)) + _LIB.XGBoosterLoadModel(self.handle, c_str(fname)) else: buf = fname length = ctypes.c_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) - xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length) + _check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)) - def dump_model(self, fo, fmap='', with_stats=False): + def dump_model(self, fout, fmap='', with_stats=False): """ Dump model into a text file. Parameters ---------- - fo : string + fout : string Output file name. fmap : string, optional Name of the file containing feature map names. with_stats : bool (optional) Controls whether the split statistics are output. """ - if isinstance(fo, string_types): - fo = open(fo, 'w') + if isinstance(fout, STRING_TYPES): + fout = open(fout, 'w') need_close = True else: need_close = False ret = self.get_dump(fmap, with_stats) for i in range(len(ret)): - fo.write('booster[{}]:\n'.format(i)) - fo.write(ret[i]) + fout.write('booster[{}]:\n'.format(i)) + fout.write(ret[i]) if need_close: - fo.close() + fout.close()
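save_raw, the buffer branch of load_model, and __setstate__ are three views of the same mechanism: the model serialized into an in-memory byte buffer. A sketch of the round trip (bst is assumed to be a trained Booster):

    raw = bst.save_raw()              # bytearray holding the binary model
    clone = Booster(model_file=raw)   # load_model accepts a buffer as well as a file name

which is how __copy__/__deepcopy__ above duplicate a booster without touching disk.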
""" length = ctypes.c_ulong() - sarr = xglib.XGBoosterDumpModel(self.handle, c_str(fmap), - int(with_stats), ctypes.byref(length)) + sarr = ctypes.POINTER(ctypes.c_char_p)() + _check_call(_LIB.XGBoosterDumpModel(self.handle, + c_str(fmap), + int(with_stats), + ctypes.byref(length), + ctypes.byref(sarr))) res = [] for i in range(length.value): res.append(str(sarr[i].decode('ascii'))) return res def get_fscore(self, fmap=''): - """ - Get feature importance of each feature. + """Get feature importance of each feature. + + Parameters + ---------- + fmap: str (optional) + The name of feature map file """ trees = self.get_dump(fmap) fmap = {} for tree in trees: - for l in tree.split('\n'): - arr = l.split('[') + for line in tree.split('\n'): + arr = line.split('[') if len(arr) == 1: continue fid = arr[1].split(']')[0] @@ -624,9 +738,9 @@ class Booster(object): def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None,evals_result=None): - """ - Train a booster with given parameters. + early_stopping_rounds=None, evals_result=None): + # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init + """Train a booster with given parameters. Parameters ---------- @@ -663,7 +777,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst = Booster(params, [dtrain] + [d[0] for d in evals]) if evals_result is not None: - if type(evals_result) is not dict: + if not isinstance(evals_result, dict): raise TypeError('evals_result has to be a dictionary') else: evals_name = [d[1] for d in evals] @@ -675,37 +789,38 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst.update(dtrain, i, obj) if len(evals) != 0: bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, string_types): + if isinstance(bst_eval_set, STRING_TYPES): msg = bst_eval_set else: msg = bst_eval_set.decode() sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":([0-9.]+).",msg) - for key,val in zip(evals_name,res): + res = re.findall(":([0-9.]+).", msg) + for key, val in zip(evals_name, res): evals_result[key].append(val) return bst else: # early stopping - if len(evals) < 1: raise ValueError('For early stopping you need at least one set in evals.') - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], early_stopping_rounds)) + sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ + evals[-1][1], early_stopping_rounds)) # is params a list of tuples? are we using multiple eval metrics? - if type(params) == list: + if isinstance(params, list): if len(params) != len(dict(params).items()): - raise ValueError('Check your params. 
@@ -624,9 +738,9 @@ class Booster(object): def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None,evals_result=None): - """ - Train a booster with given parameters. + early_stopping_rounds=None, evals_result=None): + # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init + """Train a booster with given parameters. Parameters ---------- @@ -663,7 +777,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst = Booster(params, [dtrain] + [d[0] for d in evals]) if evals_result is not None: - if type(evals_result) is not dict: + if not isinstance(evals_result, dict): raise TypeError('evals_result has to be a dictionary') else: evals_name = [d[1] for d in evals] @@ -675,37 +789,38 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst.update(dtrain, i, obj) if len(evals) != 0: bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, string_types): + if isinstance(bst_eval_set, STRING_TYPES): msg = bst_eval_set else: msg = bst_eval_set.decode() sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":([0-9.]+).",msg) - for key,val in zip(evals_name,res): + res = re.findall(":([0-9.]+).", msg) + for key, val in zip(evals_name, res): evals_result[key].append(val) return bst else: # early stopping - if len(evals) < 1: raise ValueError('For early stopping you need at least one set in evals.') - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], early_stopping_rounds)) + if len(evals) < 1: + raise ValueError('For early stopping you need at least one set in evals.') + sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ + evals[-1][1], early_stopping_rounds)) # is params a list of tuples? are we using multiple eval metrics? - if type(params) == list: + if isinstance(params, list): if len(params) != len(dict(params).items()): - raise ValueError('Check your params. Early stopping works with single eval metric only.') + raise ValueError('Check your params. '\ + 'Early stopping works with single eval metric only.') params = dict(params) # either minimize loss or maximize AUC/MAP/NDCG maximize_score = False if 'eval_metric' in params: maximize_metrics = ('auc', 'map', 'ndcg') - if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)): + if any(params['eval_metric'].startswith(x) for x in maximize_metrics): maximize_score = True if maximize_score: @@ -720,7 +835,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst.update(dtrain, i, obj) bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, string_types): + if isinstance(bst_eval_set, STRING_TYPES): msg = bst_eval_set else: msg = bst_eval_set.decode() @@ -728,8 +843,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":([0-9.]+).",msg) - for key,val in zip(evals_name,res): + res = re.findall(":([0-9.]+).", msg) + for key, val in zip(evals_name, res): evals_result[key].append(val) score = float(msg.rsplit(':', 1)[1]) @@ -748,17 +863,21 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, return bst class CVPack(object): + """Auxiliary data structure to hold one fold of CV.""" def __init__(self, dtrain, dtest, param): + """Initialize the CVPack.""" self.dtrain = dtrain self.dtest = dtest self.watchlist = [(dtrain, 'train'), (dtest, 'test')] self.bst = Booster(param, [dtrain, dtest]) - def update(self, r, fobj): - self.bst.update(self.dtrain, r, fobj) + def update(self, iteration, fobj): + """Update the boosters for one iteration.""" + self.bst.update(self.dtrain, iteration, fobj) - def eval(self, r, feval): - return self.bst.eval_set(self.watchlist, r, feval) + def eval(self, iteration, feval): + """Evaluate the CVPack for one iteration.""" + return self.bst.eval_set(self.watchlist, iteration, feval) def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): @@ -785,6 +904,7 @@ def aggcv(rlist, show_stdv=True): + # pylint: disable=invalid-name """ Aggregate cross-validation results. """ @@ -794,7 +914,7 @@ arr = line.split() assert ret == arr[0] for it in arr[1:]: - if not isinstance(it, string_types): + if not isinstance(it, STRING_TYPES): it = it.decode() k, v = it.split(':') if k not in cvmap: @@ -802,7 +922,7 @@ cvmap[k].append(float(v)) for k, v in sorted(cvmap.items(), key=lambda x: x[0]): v = np.array(v) - if not isinstance(ret, string_types): + if not isinstance(ret, STRING_TYPES): ret = ret.decode() if show_stdv: ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
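The early-stopping branch of train above watches the last entry of evals and keeps the best score seen so far; training stops once that metric has not improved for early_stopping_rounds rounds. A hypothetical invocation (dtrain and dtest assumed to exist; 'auc' is one of the metrics the branch maximizes):

    params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
    evals_result = {}
    bst = train(params, dtrain, num_boost_round=200,
                evals=[(dtrain, 'train'), (dtest, 'eval')],
                early_stopping_rounds=10, evals_result=evals_result)
    # evals_result now maps 'train' and 'eval' to their per-round metric values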
@@ -813,8 +933,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0): - """ - Cross-validation with given paramaters. + # pylint: disable = invalid-name + """Cross-validation with given parameters. Parameters ---------- @@ -847,8 +967,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), results = [] cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc) for i in range(num_boost_round): - for f in cvfolds: - f.update(i, obj) + for fold in cvfolds: + fold.update(i, obj) res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv) sys.stderr.write(res + '\n') results.append(res) @@ -857,16 +977,16 @@ # used for compatibility without sklearn XGBModelBase = object -XGBClassifier = object -XGBRegressor = object +XGBClassifierBase = object +XGBRegressorBase = object if SKLEARN_INSTALLED: XGBModelBase = BaseEstimator - XGBRegressor = RegressorMixin - XGBClassifier = ClassifierMixin + XGBRegressorBase = RegressorMixin + XGBClassifierBase = ClassifierMixin class XGBModel(XGBModelBase): - """ - Implementation of the Scikit-Learn API for XGBoost. + # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name + """Implementation of the Scikit-Learn API for XGBoost. Parameters ---------- @@ -902,8 +1022,10 @@ class XGBModel(XGBModelBase): Value in the data which needs to be present as a missing value. If None, defaults to np.nan. """ - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, + def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, + silent=True, objective="reg:linear", + nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, + subsample=1, colsample_bytree=1, base_score=0.5, seed=0, missing=None): if not SKLEARN_INSTALLED: raise XGBoostError('sklearn needs to be installed in order to use this module') @@ -923,7 +1045,6 @@ class XGBModel(XGBModelBase): self.base_score = base_score self.seed = seed self.missing = missing if missing is not None else np.nan - self._Booster = None def __setstate__(self, state): @@ -936,9 +1057,9 @@ class XGBModel(XGBModelBase): self.__dict__.update(state) def booster(self): - """ - get the underlying xgboost Booster of this model - will raise an exception when fit was not called + """Get the underlying xgboost Booster of this model. + + This will raise an exception when fit was not called. Returns ------- @@ -949,12 +1070,14 @@ class XGBModel(XGBModelBase): return self._Booster def get_params(self, deep=False): + """Get parameters.""" params = super(XGBModel, self).get_params(deep=deep) if params['missing'] is np.nan: params['missing'] = None # sklearn doesn't handle nan.
see #4725 return params def get_xgb_params(self): + """Get xgboost type parameters.""" xgb_params = self.get_params() xgb_params['silent'] = 1 if self.silent else 0 @@ -963,30 +1086,39 @@ class XGBModel(XGBModelBase): xgb_params.pop('nthread', None) return xgb_params - def fit(self, X, y): - trainDmatrix = DMatrix(X, label=y, missing=self.missing) - self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators) + def fit(self, data, y): + # pylint: disable=missing-docstring,invalid-name + train_dmatrix = DMatrix(data, label=y, missing=self.missing) + self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators) return self - def predict(self, X): - testDmatrix = DMatrix(X, missing=self.missing) - return self.booster().predict(testDmatrix) + def predict(self, data): + # pylint: disable=missing-docstring,invalid-name + test_dmatrix = DMatrix(data, missing=self.missing) + return self.booster().predict(test_dmatrix) -class XGBClassifier(XGBModel, XGBClassifier): +class XGBClassifier(XGBModel, XGBClassifierBase): + # pylint: disable=missing-docstring,too-many-arguments,invalid-name __doc__ = """ Implementation of the scikit-learn API for XGBoost classification """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, + def __init__(self, max_depth=3, learning_rate=0.1, + n_estimators=100, silent=True, + objective="binary:logistic", + nthread=-1, gamma=0, min_child_weight=1, + max_delta_step=0, subsample=1, colsample_bytree=1, base_score=0.5, seed=0, missing=None): - super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective, - nthread, gamma, min_child_weight, max_delta_step, subsample, + super(XGBClassifier, self).__init__(max_depth, learning_rate, + n_estimators, silent, objective, + nthread, gamma, min_child_weight, + max_delta_step, subsample, colsample_bytree, base_score, seed, missing) def fit(self, X, y, sample_weight=None): + # pylint: disable = attribute-defined-outside-init,arguments-differ self.classes_ = list(np.unique(y)) self.n_classes_ = len(self.classes_) if self.n_classes_ > 2: @@ -1001,29 +1133,29 @@ class XGBClassifier(XGBModel, XGBClassifier): training_labels = self._le.transform(y) if sample_weight is not None: - trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight, - missing=self.missing) + train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, + missing=self.missing) else: - trainDmatrix = DMatrix(X, label=training_labels, - missing=self.missing) + train_dmatrix = DMatrix(X, label=training_labels, + missing=self.missing) - self._Booster = train(xgb_options, trainDmatrix, self.n_estimators) + self._Booster = train(xgb_options, train_dmatrix, self.n_estimators) return self - def predict(self, X): - testDmatrix = DMatrix(X, missing=self.missing) - class_probs = self.booster().predict(testDmatrix) + def predict(self, data): + test_dmatrix = DMatrix(data, missing=self.missing) + class_probs = self.booster().predict(test_dmatrix) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: - column_indexes = np.repeat(0, X.shape[0]) + column_indexes = np.repeat(0, data.shape[0]) column_indexes[class_probs > 0.5] = 1 return self._le.inverse_transform(column_indexes) - def predict_proba(self, X): - testDmatrix = DMatrix(X, missing=self.missing) - class_probs = 
self.booster().predict(testDmatrix) if self.objective == "multi:softprob": return class_probs else: @@ -1031,9 +1163,8 @@ class XGBClassifier(XGBModel, XGBClassifier): classzero_probs = 1.0 - classone_probs return np.vstack((classzero_probs, classone_probs)).transpose() -class XGBRegressor(XGBModel, XGBRegressor): +class XGBRegressor(XGBModel, XGBRegressorBase): + # pylint: disable=missing-docstring __doc__ = """ Implementation of the scikit-learn API for XGBoost regression """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - - pass diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 4d7828faf..fb33d0392 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -1,3 +1,4 @@ +// Copyright (c) 2014 by Contributors // implementations in ctypes #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE @@ -7,6 +8,7 @@ #include #include #include +#include // include all std functions using namespace std; #include "./xgboost_wrapper.h" @@ -31,9 +33,11 @@ class Booster: public learner::BoostLearner { this->init_model = false; this->SetCacheData(mats); } - inline const float *Pred(const DataMatrix &dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) { + inline const float *Pred(const DataMatrix &dmat, int option_mask, + unsigned ntree_limit, bst_ulong *len) { this->CheckInitModel(); - this->Predict(dmat, (option_mask&1) != 0, &this->preds_, ntree_limit, (option_mask&2) != 0); + this->Predict(dmat, (option_mask&1) != 0, &this->preds_, + ntree_limit, (option_mask&2) != 0); *len = static_cast<bst_ulong>(this->preds_.size()); return BeginPtr(this->preds_); } @@ -57,9 +61,9 @@ class Booster: public learner::BoostLearner { this->init_model = true; } inline void LoadModelFromBuffer(const void *buf, size_t size) { - utils::MemoryFixSizeBuffer fs((void*)buf, size); + utils::MemoryFixSizeBuffer fs((void*)buf, size); // NOLINT(*) learner::BoostLearner::LoadModel(fs, true); - this->init_model = true; + this->init_model = true; } inline const char *GetModelRaw(bst_ulong *out_len) { this->CheckInitModel(); @@ -94,251 +98,461 @@ class Booster: public learner::BoostLearner { private: bool init_model; }; + +// helper to support threadlocal +struct ThreadLocalStore { + std::vector<std::string*> data; + // allocate a string + inline std::string *Alloc() { + mutex.Lock(); + data.push_back(new std::string()); + std::string *ret = data.back(); + mutex.Unlock(); + return ret; + } + ThreadLocalStore() { + mutex.Init(); + } + ~ThreadLocalStore() { + for (size_t i = 0; i < data.size(); ++i) { + delete data[i]; + } + mutex.Destroy(); + } + utils::Mutex mutex; +}; + +static ThreadLocalStore thread_local_store; } // namespace wrapper } // namespace xgboost using namespace xgboost::wrapper; -extern "C"{ - void* XGDMatrixCreateFromFile(const char *fname, int silent) { - return LoadDataMatrix(fname, silent != 0, false, false); +/*! \brief macro to guard beginning and end section of all functions */ +#define API_BEGIN() try { +/*! + * \brief every function starts with API_BEGIN(); and finishes with API_END(); + * \param Finalize optionally put in a finalizer + */ +#define API_END_FINALIZE(Finalize) } catch(std::exception &e) { \ + Finalize; return XGBHandleException(e); \ + } return 0; +/*!
\brief API End with no finalization */ +#define API_END() API_END_FINALIZE(;) + +// do not use threadlocal on OSX since it is not always available +#ifndef DISABLE_THREAD_LOCAL +#ifdef __GNUC__ + #define XGB_THREAD_LOCAL __thread +#elif __STDC_VERSION__ >= 201112L + #define XGB_THREAD_LOCAL _Thread_local +#elif defined(_MSC_VER) + #define XGB_THREAD_LOCAL __declspec(thread) +#endif +#endif + +#ifndef XGB_THREAD_LOCAL +#pragma message("Warning: thread-local storage not enabled, using single-thread error handling") +#define XGB_THREAD_LOCAL +#endif + +/*! + * \brief a helper function for error handling + * will set the last error to be str_set when it is not NULL + * \param str_set the error to set + * \return a pointer to the last error message + */ +const char *XGBSetGetLastError_(const char *str_set) { + // use last_error to record last error + static XGB_THREAD_LOCAL std::string *last_error = NULL; + if (last_error == NULL) { + last_error = thread_local_store.Alloc(); } - void* XGDMatrixCreateFromCSR(const bst_ulong *indptr, - const unsigned *indices, - const float *data, - bst_ulong nindptr, - bst_ulong nelem) { - DMatrixSimple *p_mat = new DMatrixSimple(); - DMatrixSimple &mat = *p_mat; - mat.row_ptr_.resize(nindptr); - for (bst_ulong i = 0; i < nindptr; ++i) { - mat.row_ptr_[i] = static_cast<size_t>(indptr[i]); - } - mat.row_data_.resize(nelem); - for (bst_ulong i = 0; i < nelem; ++i) { - mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]); - mat.info.info.num_col = std::max(mat.info.info.num_col, - static_cast<size_t>(indices[i]+1)); - } - mat.info.info.num_row = nindptr - 1; - return p_mat; + if (str_set != NULL) { + *last_error = str_set; } - XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, - const unsigned *indices, - const float *data, - bst_ulong nindptr, - bst_ulong nelem) { - int nthread; - #pragma omp parallel - { - nthread = omp_get_num_threads(); - } - - DMatrixSimple *p_mat = new DMatrixSimple(); - DMatrixSimple &mat = *p_mat; - utils::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_); - builder.InitBudget(0, nthread); - long ncol = static_cast<long>(nindptr - 1); - #pragma omp parallel for schedule(static) - for (long i = 0; i < ncol; ++i) { - int tid = omp_get_thread_num(); - for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.AddBudget(indices[j], tid); - } - } - builder.InitStorage(); - #pragma omp parallel for schedule(static) - for (long i = 0; i < ncol; ++i) { - int tid = omp_get_thread_num(); - for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.Push(indices[j], - RowBatch::Entry(static_cast<bst_uint>(i), data[j]), - tid); - } - } - mat.info.info.num_row = mat.row_ptr_.size() - 1; - mat.info.info.num_col = static_cast<size_t>(ncol); - return p_mat; + return last_error->c_str(); +} + +/*! \brief return str message of the last error */ +const char *XGBGetLastError() { + return XGBSetGetLastError_(NULL); +} +
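From the caller's perspective this is the whole error contract: check the returned status, and on failure ask XGBGetLastError for the message recorded by the failing thread. A ctypes sketch mirroring _check_call from the Python wrapper above (the file name is hypothetical, and XGBGetLastError's restype is assumed to have been set to ctypes.c_char_p):

    out = ctypes.c_void_p()
    status = _LIB.XGDMatrixCreateFromFile(c_str('train.libsvm'), 1, ctypes.byref(out))
    if status != 0:
        raise XGBoostError(_LIB.XGBGetLastError())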
+/*! + * \brief handle a thrown exception + * \param e the exception + * \return the return value of API after exception is handled + */ +int XGBHandleException(const std::exception &e) { + XGBSetGetLastError_(e.what()); + return -1; +} + +int XGDMatrixCreateFromFile(const char *fname, + int silent, + DMatrixHandle *out) { + API_BEGIN(); + *out = LoadDataMatrix(fname, silent != 0, false, false); + API_END(); +} + +int XGDMatrixCreateFromCSR(const bst_ulong *indptr, + const unsigned *indices, + const float *data, + bst_ulong nindptr, + bst_ulong nelem, + DMatrixHandle *out) { + DMatrixSimple *p_mat = NULL; + API_BEGIN(); + p_mat = new DMatrixSimple(); + DMatrixSimple &mat = *p_mat; + mat.row_ptr_.resize(nindptr); + for (bst_ulong i = 0; i < nindptr; ++i) { + mat.row_ptr_[i] = static_cast<size_t>(indptr[i]); } - void* XGDMatrixCreateFromMat(const float *data, - bst_ulong nrow, - bst_ulong ncol, - float missing) { - bool nan_missing = utils::CheckNAN(missing); - DMatrixSimple *p_mat = new DMatrixSimple(); - DMatrixSimple &mat = *p_mat; - mat.info.info.num_row = nrow; - mat.info.info.num_col = ncol; - for (bst_ulong i = 0; i < nrow; ++i, data += ncol) { - bst_ulong nelem = 0; - for (bst_ulong j = 0; j < ncol; ++j) { - if (utils::CheckNAN(data[j])) { - utils::Check(nan_missing, - "There are NAN in the matrix, however, you did not set missing=NAN"); - } else { - if (nan_missing || data[j] != missing) { - mat.row_data_.push_back(RowBatch::Entry(j, data[j])); - ++nelem; - } + mat.row_data_.resize(nelem); + for (bst_ulong i = 0; i < nelem; ++i) { + mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]); + mat.info.info.num_col = std::max(mat.info.info.num_col, + static_cast<size_t>(indices[i]+1)); + } + mat.info.info.num_row = nindptr - 1; + *out = p_mat; + API_END_FINALIZE(delete p_mat); +} + +int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, + const unsigned *indices, + const float *data, + bst_ulong nindptr, + bst_ulong nelem, + DMatrixHandle *out) { + DMatrixSimple *p_mat = NULL; + API_BEGIN(); + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + } + p_mat = new DMatrixSimple(); + DMatrixSimple &mat = *p_mat; + utils::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_); + builder.InitBudget(0, nthread); + long ncol = static_cast<long>(nindptr - 1); // NOLINT(*) + #pragma omp parallel for schedule(static) + for (long i = 0; i < ncol; ++i) { // NOLINT(*) + int tid = omp_get_thread_num(); + for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.AddBudget(indices[j], tid); + } + } + builder.InitStorage(); + #pragma omp parallel for schedule(static) + for (long i = 0; i < ncol; ++i) { // NOLINT(*) + int tid = omp_get_thread_num(); + for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.Push(indices[j], + RowBatch::Entry(static_cast<bst_uint>(i), data[j]), + tid); + } + } + mat.info.info.num_row = mat.row_ptr_.size() - 1; + mat.info.info.num_col = static_cast<size_t>(ncol); + *out = p_mat; + API_END_FINALIZE(delete p_mat); +} + +int XGDMatrixCreateFromMat(const float *data, + bst_ulong nrow, + bst_ulong ncol, + float missing, + DMatrixHandle *out) { + DMatrixSimple *p_mat = NULL; + API_BEGIN(); + p_mat = new DMatrixSimple(); + bool nan_missing = utils::CheckNAN(missing); + DMatrixSimple &mat = *p_mat; + mat.info.info.num_row = nrow; + mat.info.info.num_col = ncol; + for (bst_ulong i = 0; i < nrow; ++i, data += ncol) { + bst_ulong nelem = 0; + for (bst_ulong j = 0; j < ncol; ++j) { + if (utils::CheckNAN(data[j])) { + utils::Check(nan_missing, + "There are NAN in the matrix, however,
you did not set missing=NAN"); + } else { + if (nan_missing || data[j] != missing) { + mat.row_data_.push_back(RowBatch::Entry(j, data[j])); + ++nelem; } } - mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem); } - return p_mat; - } - void* XGDMatrixSliceDMatrix(void *handle, - const int *idxset, - bst_ulong len) { - DMatrixSimple tmp; - DataMatrix &dsrc = *static_cast(handle); - if (dsrc.magic != DMatrixSimple::kMagic) { - tmp.CopyFrom(dsrc); - } - DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ? - *static_cast(handle): tmp); - DMatrixSimple *p_ret = new DMatrixSimple(); - DMatrixSimple &ret = *p_ret; - - utils::Check(src.info.group_ptr.size() == 0, - "slice does not support group structure"); - ret.Clear(); - ret.info.info.num_row = len; - ret.info.info.num_col = src.info.num_col(); - - utils::IIterator *iter = src.fmat()->RowIterator(); - iter->BeforeFirst(); - utils::Assert(iter->Next(), "slice"); - const RowBatch &batch = iter->Value(); - for (bst_ulong i = 0; i < len; ++i) { - const int ridx = idxset[i]; - RowBatch::Inst inst = batch[ridx]; - utils::Check(static_cast(ridx) < batch.size, "slice index exceed number of rows"); - ret.row_data_.resize(ret.row_data_.size() + inst.length); - memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data, - sizeof(RowBatch::Entry) * inst.length); - ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length); - if (src.info.labels.size() != 0) { - ret.info.labels.push_back(src.info.labels[ridx]); - } - if (src.info.weights.size() != 0) { - ret.info.weights.push_back(src.info.weights[ridx]); - } - if (src.info.info.root_index.size() != 0) { - ret.info.info.root_index.push_back(src.info.info.root_index[ridx]); - } - if (src.info.info.fold_index.size() != 0) { - ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]); - } - } - return p_ret; - } - void XGDMatrixFree(void *handle) { - delete static_cast(handle); - } - void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) { - SaveDataMatrix(*static_cast(handle), fname, silent != 0); - } - void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) { - std::vector &vec = - static_cast(handle)->info.GetFloatInfo(field); - vec.resize(len); - memcpy(BeginPtr(vec), info, sizeof(float) * len); - } - void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, bst_ulong len) { - std::vector &vec = - static_cast(handle)->info.GetUIntInfo(field); - vec.resize(len); - memcpy(BeginPtr(vec), info, sizeof(unsigned) * len); - } - void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len) { - DataMatrix *pmat = static_cast(handle); - pmat->info.group_ptr.resize(len + 1); - pmat->info.group_ptr[0] = 0; - for (uint64_t i = 0; i < len; ++i) { - pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i] + group[i]; - } - } - const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* len) { - const std::vector &vec = - static_cast(handle)->info.GetFloatInfo(field); - *len = static_cast(vec.size()); - return BeginPtr(vec); - } - const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) { - const std::vector &vec = - static_cast(handle)->info.GetUIntInfo(field); - *len = static_cast(vec.size()); - return BeginPtr(vec); - } - bst_ulong XGDMatrixNumRow(const void *handle) { - return static_cast(static_cast(handle)->info.num_row()); - } - - // xgboost implementation - void *XGBoosterCreate(void *dmats[], bst_ulong len) { - std::vector mats; - for (bst_ulong i = 0; i < len; ++i) 
{ - DataMatrix *dtr = static_cast(dmats[i]); - mats.push_back(dtr); - } - return new Booster(mats); - } - void XGBoosterFree(void *handle) { - delete static_cast(handle); - } - void XGBoosterSetParam(void *handle, const char *name, const char *value) { - static_cast(handle)->SetParam(name, value); - } - void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain) { - Booster *bst = static_cast(handle); - DataMatrix *dtr = static_cast(dtrain); - bst->CheckInitModel(); - bst->CheckInit(dtr); - bst->UpdateOneIter(iter, *dtr); - } - void XGBoosterBoostOneIter(void *handle, void *dtrain, - float *grad, float *hess, bst_ulong len) { - Booster *bst = static_cast(handle); - DataMatrix *dtr = static_cast(dtrain); - bst->CheckInitModel(); - bst->CheckInit(dtr); - bst->BoostOneIter(*dtr, grad, hess, len); - } - const char* XGBoosterEvalOneIter(void *handle, int iter, void *dmats[], - const char *evnames[], bst_ulong len) { - Booster *bst = static_cast(handle); - std::vector names; - std::vector mats; - for (bst_ulong i = 0; i < len; ++i) { - mats.push_back(static_cast(dmats[i])); - names.push_back(std::string(evnames[i])); - } - bst->CheckInitModel(); - bst->eval_str = bst->EvalOneIter(iter, mats, names); - return bst->eval_str.c_str(); - } - const float *XGBoosterPredict(void *handle, void *dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) { - return static_cast(handle)->Pred(*static_cast(dmat), option_mask, ntree_limit, len); - } - void XGBoosterLoadModel(void *handle, const char *fname) { - static_cast(handle)->LoadModel(fname); - } - void XGBoosterSaveModel(void *handle, const char *fname) { - Booster *bst = static_cast(handle); - bst->CheckInitModel(); - bst->SaveModel(fname, false); - } - void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) { - static_cast(handle)->LoadModelFromBuffer(buf, len); - } - const char *XGBoosterGetModelRaw(void *handle, bst_ulong *out_len) { - return static_cast(handle)->GetModelRaw(out_len); - } - const char** XGBoosterDumpModel(void *handle, const char *fmap, int with_stats, bst_ulong *len){ - utils::FeatMap featmap; - if (strlen(fmap) != 0) { - featmap.LoadText(fmap); - } - return static_cast(handle)->GetModelDump(featmap, with_stats != 0, len); + mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem); } + *out = p_mat; + API_END_FINALIZE(delete p_mat); +} + +int XGDMatrixSliceDMatrix(DMatrixHandle handle, + const int *idxset, + bst_ulong len, + DMatrixHandle *out) { + DMatrixSimple *p_ret = NULL; + API_BEGIN(); + DMatrixSimple tmp; + DataMatrix &dsrc = *static_cast(handle); + if (dsrc.magic != DMatrixSimple::kMagic) { + tmp.CopyFrom(dsrc); + } + DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ? 
+ *static_cast(handle): tmp); + p_ret = new DMatrixSimple(); + DMatrixSimple &ret = *p_ret; + + utils::Check(src.info.group_ptr.size() == 0, + "slice does not support group structure"); + ret.Clear(); + ret.info.info.num_row = len; + ret.info.info.num_col = src.info.num_col(); + + utils::IIterator *iter = src.fmat()->RowIterator(); + iter->BeforeFirst(); + utils::Assert(iter->Next(), "slice"); + const RowBatch &batch = iter->Value(); + for (bst_ulong i = 0; i < len; ++i) { + const int ridx = idxset[i]; + RowBatch::Inst inst = batch[ridx]; + utils::Check(static_cast(ridx) < batch.size, "slice index exceed number of rows"); + ret.row_data_.resize(ret.row_data_.size() + inst.length); + memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data, + sizeof(RowBatch::Entry) * inst.length); + ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length); + if (src.info.labels.size() != 0) { + ret.info.labels.push_back(src.info.labels[ridx]); + } + if (src.info.weights.size() != 0) { + ret.info.weights.push_back(src.info.weights[ridx]); + } + if (src.info.info.root_index.size() != 0) { + ret.info.info.root_index.push_back(src.info.info.root_index[ridx]); + } + if (src.info.info.fold_index.size() != 0) { + ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]); + } + } + *out = p_ret; + API_END_FINALIZE(delete p_ret); +} + +int XGDMatrixFree(DMatrixHandle handle) { + API_BEGIN(); + delete static_cast(handle); + API_END(); +} + +int XGDMatrixSaveBinary(DMatrixHandle handle, + const char *fname, + int silent) { + API_BEGIN(); + SaveDataMatrix(*static_cast(handle), fname, silent != 0); + API_END(); +} + +int XGDMatrixSetFloatInfo(DMatrixHandle handle, + const char *field, + const float *info, + bst_ulong len) { + API_BEGIN(); + std::vector &vec = + static_cast(handle)->info.GetFloatInfo(field); + vec.resize(len); + memcpy(BeginPtr(vec), info, sizeof(float) * len); + API_END(); +} + +int XGDMatrixSetUIntInfo(DMatrixHandle handle, + const char *field, + const unsigned *info, + bst_ulong len) { + API_BEGIN(); + std::vector &vec = + static_cast(handle)->info.GetUIntInfo(field); + vec.resize(len); + memcpy(BeginPtr(vec), info, sizeof(unsigned) * len); + API_END(); +} + +int XGDMatrixSetGroup(DMatrixHandle handle, + const unsigned *group, + bst_ulong len) { + API_BEGIN(); + DataMatrix *pmat = static_cast(handle); + pmat->info.group_ptr.resize(len + 1); + pmat->info.group_ptr[0] = 0; + for (uint64_t i = 0; i < len; ++i) { + pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i] + group[i]; + } + API_END(); +} + +int XGDMatrixGetFloatInfo(const DMatrixHandle handle, + const char *field, + bst_ulong *out_len, + const float **out_dptr) { + API_BEGIN(); + const std::vector &vec = + static_cast(handle)->info.GetFloatInfo(field); + *out_len = static_cast(vec.size()); + *out_dptr = BeginPtr(vec); + API_END(); +} + +int XGDMatrixGetUIntInfo(const DMatrixHandle handle, + const char *field, + bst_ulong *out_len, + const unsigned **out_dptr) { + API_BEGIN(); + const std::vector &vec = + static_cast(handle)->info.GetUIntInfo(field); + *out_len = static_cast(vec.size()); + *out_dptr = BeginPtr(vec); + API_END(); +} +int XGDMatrixNumRow(const DMatrixHandle handle, + bst_ulong *out) { + API_BEGIN(); + *out = static_cast(static_cast(handle)->info.num_row()); + API_END(); +} + +// xgboost implementation +int XGBoosterCreate(DMatrixHandle dmats[], + bst_ulong len, + BoosterHandle *out) { + API_BEGIN(); + std::vector mats; + for (bst_ulong i = 0; i < len; ++i) { + DataMatrix *dtr = static_cast(dmats[i]); + mats.push_back(dtr); 
+ } + *out = new Booster(mats); + API_END(); +} + +int XGBoosterFree(BoosterHandle handle) { + API_BEGIN(); + delete static_cast(handle); + API_END(); +} + +int XGBoosterSetParam(BoosterHandle handle, + const char *name, const char *value) { + API_BEGIN(); + static_cast(handle)->SetParam(name, value); + API_END(); +} + +int XGBoosterUpdateOneIter(BoosterHandle handle, + int iter, + DMatrixHandle dtrain) { + API_BEGIN(); + Booster *bst = static_cast(handle); + DataMatrix *dtr = static_cast(dtrain); + bst->CheckInitModel(); + bst->CheckInit(dtr); + bst->UpdateOneIter(iter, *dtr); + API_END(); +} + +int XGBoosterBoostOneIter(BoosterHandle handle, + DMatrixHandle dtrain, + float *grad, + float *hess, + bst_ulong len) { + API_BEGIN(); + Booster *bst = static_cast(handle); + DataMatrix *dtr = static_cast(dtrain); + bst->CheckInitModel(); + bst->CheckInit(dtr); + bst->BoostOneIter(*dtr, grad, hess, len); + API_END(); +} + +int XGBoosterEvalOneIter(BoosterHandle handle, + int iter, + DMatrixHandle dmats[], + const char *evnames[], + bst_ulong len, + const char **out_str) { + API_BEGIN(); + Booster *bst = static_cast(handle); + std::vector names; + std::vector mats; + for (bst_ulong i = 0; i < len; ++i) { + mats.push_back(static_cast(dmats[i])); + names.push_back(std::string(evnames[i])); + } + bst->CheckInitModel(); + bst->eval_str = bst->EvalOneIter(iter, mats, names); + *out_str = bst->eval_str.c_str(); + API_END(); +} + +int XGBoosterPredict(BoosterHandle handle, + DMatrixHandle dmat, + int option_mask, + unsigned ntree_limit, + bst_ulong *len, + const float **out_result) { + API_BEGIN(); + *out_result = static_cast(handle)-> + Pred(*static_cast(dmat), + option_mask, ntree_limit, len); + API_END(); +} + +int XGBoosterLoadModel(BoosterHandle handle, const char *fname) { + API_BEGIN(); + static_cast(handle)->LoadModel(fname); + API_END(); +} + +int XGBoosterSaveModel(BoosterHandle handle, const char *fname) { + API_BEGIN(); + Booster *bst = static_cast(handle); + bst->CheckInitModel(); + bst->SaveModel(fname, false); + API_END(); +} + +int XGBoosterLoadModelFromBuffer(BoosterHandle handle, + const void *buf, + bst_ulong len) { + API_BEGIN(); + static_cast(handle)->LoadModelFromBuffer(buf, len); + API_END(); +} + +int XGBoosterGetModelRaw(BoosterHandle handle, + bst_ulong *out_len, + const char **out_dptr) { + API_BEGIN(); + *out_dptr = static_cast(handle)->GetModelRaw(out_len); + API_END(); +} + +int XGBoosterDumpModel(BoosterHandle handle, + const char *fmap, + int with_stats, + bst_ulong *len, + const char ***out_models) { + API_BEGIN(); + utils::FeatMap featmap; + if (strlen(fmap) != 0) { + featmap.LoadText(fmap); + } + *out_models = static_cast(handle)->GetModelDump( + featmap, with_stats != 0, len); + API_END(); } diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 88a327d0d..6d3a619fb 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -1,235 +1,327 @@ -#ifndef XGBOOST_WRAPPER_H_ -#define XGBOOST_WRAPPER_H_ /*! + * Copyright (c) 2014 by Contributors * \file xgboost_wrapper.h * \author Tianqi Chen * \brief a C style wrapper of xgboost * can be used to create wrapper of other languages */ -#if defined(_MSC_VER) || defined(_WIN32) -#define XGB_DLL __declspec(dllexport) -#else -#define XGB_DLL -#endif -// manually define unsign long -typedef unsigned long bst_ulong; +#ifndef XGBOOST_WRAPPER_H_ +#define XGBOOST_WRAPPER_H_ #ifdef __cplusplus -extern "C" { +#define XGB_EXTERN_C extern "C" #endif - /*! 
- * \brief load a data matrix - * \param fname the name of the file - * \param silent whether print messages during loading - * \return a loaded data matrix - */ - XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent); - /*! - * \brief create a matrix content from csr format - * \param indptr pointer to row headers - * \param indices findex - * \param data fvalue - * \param nindptr number of rows in the matix + 1 - * \param nelem number of nonzero elements in the matrix - * \return created dmatrix - */ - XGB_DLL void* XGDMatrixCreateFromCSR(const bst_ulong *indptr, - const unsigned *indices, - const float *data, - bst_ulong nindptr, - bst_ulong nelem); - /*! - * \brief create a matrix content from CSC format - * \param col_ptr pointer to col headers - * \param indices findex - * \param data fvalue - * \param nindptr number of rows in the matix + 1 - * \param nelem number of nonzero elements in the matrix - * \return created dmatrix - */ - XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, - const unsigned *indices, - const float *data, - bst_ulong nindptr, - bst_ulong nelem); - /*! - * \brief create matrix content from dense matrix - * \param data pointer to the data space - * \param nrow number of rows - * \param ncol number columns - * \param missing which value to represent missing value - * \return created dmatrix - */ - XGB_DLL void* XGDMatrixCreateFromMat(const float *data, - bst_ulong nrow, - bst_ulong ncol, - float missing); - /*! - * \brief create a new dmatrix from sliced content of existing matrix - * \param handle instance of data matrix to be sliced - * \param idxset index set - * \param len length of index set - * \return a sliced new matrix - */ - XGB_DLL void* XGDMatrixSliceDMatrix(void *handle, - const int *idxset, - bst_ulong len); - /*! - * \brief free space in data matrix - */ - XGB_DLL void XGDMatrixFree(void *handle); - /*! - * \brief load a data matrix into binary file - * \param handle a instance of data matrix - * \param fname file name - * \param silent print statistics when saving - */ - XGB_DLL void XGDMatrixSaveBinary(void *handle, const char *fname, int silent); - /*! - * \brief set float vector to a content in info - * \param handle a instance of data matrix - * \param field field name, can be label, weight - * \param array pointer to float vector - * \param len length of array - */ - XGB_DLL void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, bst_ulong len); - /*! - * \brief set uint32 vector to a content in info - * \param handle a instance of data matrix - * \param field field name - * \param array pointer to float vector - * \param len length of array - */ - XGB_DLL void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *array, bst_ulong len); - /*! - * \brief set label of the training matrix - * \param handle a instance of data matrix - * \param group pointer to group size - * \param len length of array - */ - XGB_DLL void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len); - /*! - * \brief get float info vector from matrix - * \param handle a instance of data matrix - * \param field field name - * \param out_len used to set result length - * \return pointer to the result - */ - XGB_DLL const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* out_len); - /*! 
- * \brief get uint32 info vector from matrix - * \param handle a instance of data matrix - * \param field field name - * \param out_len used to set result length - * \return pointer to the result - */ - XGB_DLL const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* out_len); - /*! - * \brief return number of rows - */ - XGB_DLL bst_ulong XGDMatrixNumRow(const void *handle); - // --- start XGBoost class - /*! - * \brief create xgboost learner - * \param dmats matrices that are set to be cached - * \param len length of dmats - */ - XGB_DLL void *XGBoosterCreate(void* dmats[], bst_ulong len); - /*! - * \brief free obj in handle - * \param handle handle to be freed - */ - XGB_DLL void XGBoosterFree(void* handle); - /*! - * \brief set parameters - * \param handle handle - * \param name parameter name - * \param val value of parameter - */ - XGB_DLL void XGBoosterSetParam(void *handle, const char *name, const char *value); - /*! - * \brief update the model in one round using dtrain - * \param handle handle - * \param iter current iteration rounds - * \param dtrain training data - */ - XGB_DLL void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain); - /*! - * \brief update the model, by directly specify gradient and second order gradient, - * this can be used to replace UpdateOneIter, to support customized loss function - * \param handle handle - * \param dtrain training data - * \param grad gradient statistics - * \param hess second order gradient statistics - * \param len length of grad/hess array - */ - XGB_DLL void XGBoosterBoostOneIter(void *handle, void *dtrain, - float *grad, float *hess, bst_ulong len); - /*! - * \brief get evaluation statistics for xgboost - * \param handle handle - * \param iter current iteration rounds - * \param dmats pointers to data to be evaluated - * \param evnames pointers to names of each data - * \param len length of dmats - * \return the string containing evaluation stati - */ - XGB_DLL const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[], - const char *evnames[], bst_ulong len); - /*! - * \brief make prediction based on dmat - * \param handle handle - * \param dmat data matrix - * \param option_mask bit-mask of options taken in prediction, possible values - * 0:normal prediction - * 1:output margin instead of transformed value - * 2:output leaf index of trees instead of leaf value, note leaf index is unique per tree - * \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees - * when the parameter is set to 0, we will use all the trees - * \param len used to store length of returning result - */ - XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, - int option_mask, - unsigned ntree_limit, - bst_ulong *len); - /*! - * \brief load model from existing file - * \param handle handle - * \param fname file name - */ - XGB_DLL void XGBoosterLoadModel(void *handle, const char *fname); - /*! - * \brief save model into existing file - * \param handle handle - * \param fname file name - */ - XGB_DLL void XGBoosterSaveModel(void *handle, const char *fname); - /*! - * \brief load model from in memory buffer - * \param handle handle - * \param buf pointer to the buffer - * \param len the length of the buffer - */ - XGB_DLL void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len); - /*! 
- * \brief save model into binary raw bytes, return header of the array - * user must copy the result out, before next xgboost call - * \param handle handle - * \param out_len the argument to hold the output length - * \return the pointer to the beginning of binary buffer - */ - XGB_DLL const char *XGBoosterGetModelRaw(void *handle, bst_ulong *out_len); - /*! - * \brief dump model, return array of strings representing model dump - * \param handle handle - * \param fmap name to fmap can be empty string - * \param with_stats whether to dump with statistics - * \param out_len length of output array - * \return char *data[], representing dump of each model - */ - XGB_DLL const char **XGBoosterDumpModel(void *handle, const char *fmap, - int with_stats, bst_ulong *out_len); -#ifdef __cplusplus -} + +#if defined(_MSC_VER) || defined(_WIN32) +#define XGB_DLL XGB_EXTERN_C __declspec(dllexport) +#else +#define XGB_DLL XGB_EXTERN_C #endif +// manually define unsigned long +typedef unsigned long bst_ulong; // NOLINT(*) + +/*! \brief handle to DMatrix */ +typedef void *DMatrixHandle; +/*! \brief handle to Booster */ +typedef void *BoosterHandle; + +/*! + * \brief get string message of the last error + * + * all functions in this file will return 0 when success + * and -1 when an error occurred, + * XGBGetLastError can be called to retrieve the error + * + * this function is thread-safe and can be called by different threads + * \return const char* error information + */ +XGB_DLL const char *XGBGetLastError(); + +/*! + * \brief load a data matrix + * \param fname the name of the file + * \param silent whether to print messages during loading + * \param out a loaded data matrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromFile(const char *fname, + int silent, + DMatrixHandle *out); + +/*! + * \brief create a matrix content from csr format + * \param indptr pointer to row headers + * \param indices findex + * \param data fvalue + * \param nindptr number of rows in the matrix + 1 + * \param nelem number of nonzero elements in the matrix + * \param out created dmatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromCSR(const bst_ulong *indptr, + const unsigned *indices, + const float *data, + bst_ulong nindptr, + bst_ulong nelem, + DMatrixHandle *out); +/*! + * \brief create a matrix content from CSC format + * \param col_ptr pointer to col headers + * \param indices findex + * \param data fvalue + * \param nindptr number of rows in the matrix + 1 + * \param nelem number of nonzero elements in the matrix + * \param out created dmatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, + const unsigned *indices, + const float *data, + bst_ulong nindptr, + bst_ulong nelem, + DMatrixHandle *out); +/*! + * \brief create matrix content from dense matrix + * \param data pointer to the data space + * \param nrow number of rows + * \param ncol number of columns + * \param missing which value to represent missing value + * \param out created dmatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromMat(const float *data, + bst_ulong nrow, + bst_ulong ncol, + float missing, + DMatrixHandle *out);
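The creation routines above all share one shape: raw arrays in, a status code out, and the new handle returned through the trailing out-parameter. A sketch of driving XGDMatrixCreateFromCSR from ctypes, assuming csr is a scipy.sparse.csr_matrix and that c_array, _check_call, and _LIB are as defined in the Python wrapper above:

    out = ctypes.c_void_p()
    _check_call(_LIB.XGDMatrixCreateFromCSR(
        c_array(ctypes.c_ulong, csr.indptr),   # nindptr = nrow + 1 row offsets
        c_array(ctypes.c_uint, csr.indices),
        c_array(ctypes.c_float, csr.data),
        len(csr.indptr), len(csr.data),
        ctypes.byref(out)))                    # out now holds a DMatrixHandle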
+/*! + * \brief create a new dmatrix from sliced content of existing matrix + * \param handle instance of data matrix to be sliced + * \param idxset index set + * \param len length of index set + * \param out a sliced new matrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, + const int *idxset, + bst_ulong len, + DMatrixHandle *out); +/*! + * \brief free space in data matrix + * \param handle an instance of data matrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixFree(DMatrixHandle handle); +/*! + * \brief save a data matrix into a binary file + * \param handle an instance of data matrix + * \param fname file name + * \param silent print statistics when saving + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle, + const char *fname, int silent); +/*! + * \brief set float vector to a content in info + * \param handle an instance of data matrix + * \param field field name, can be label, weight + * \param array pointer to float vector + * \param len length of array + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, + const char *field, + const float *array, + bst_ulong len); +/*! + * \brief set uint32 vector to a content in info + * \param handle an instance of data matrix + * \param field field name + * \param array pointer to unsigned int vector + * \param len length of array + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, + const char *field, + const unsigned *array, + bst_ulong len); +/*! + * \brief set group sizes of the training matrix + * \param handle an instance of data matrix + * \param group pointer to group size + * \param len length of array + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, + const unsigned *group, + bst_ulong len); +/*! + * \brief get float info vector from matrix + * \param handle an instance of data matrix + * \param field field name + * \param out_len used to set result length + * \param out_dptr pointer to the result + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle, + const char *field, + bst_ulong* out_len, + const float **out_dptr); +/*! + * \brief get uint32 info vector from matrix + * \param handle an instance of data matrix + * \param field field name + * \param out_len used to set result length + * \param out_dptr pointer to the result + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle, + const char *field, + bst_ulong* out_len, + const unsigned **out_dptr); +/*! + * \brief get number of rows + * \param handle the handle to the DMatrix + * \param out used to set the number of rows + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, + bst_ulong *out); // --- start XGBoost class /*! + * \brief create xgboost learner + * \param dmats matrices that are set to be cached + * \param len length of dmats + * \param out handle to the result booster + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterCreate(DMatrixHandle dmats[], + bst_ulong len, + BoosterHandle *out); +/*! + * \brief free obj in handle + * \param handle handle to be freed + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterFree(BoosterHandle handle); + +/*!
+ * \brief set parameters + * \param handle handle + * \param name parameter name + * \param value value of parameter + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterSetParam(BoosterHandle handle, + const char *name, + const char *value); /*! + * \brief update the model in one round using dtrain + * \param handle handle + * \param iter current iteration rounds + * \param dtrain training data + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, + int iter, + DMatrixHandle dtrain); /*! + * \brief update the model, by directly specifying gradient and second order gradient, + * this can be used to replace UpdateOneIter, to support customized loss function + * \param handle handle + * \param dtrain training data + * \param grad gradient statistics + * \param hess second order gradient statistics + * \param len length of grad/hess array + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, + DMatrixHandle dtrain, + float *grad, + float *hess, + bst_ulong len); /*! + * \brief get evaluation statistics for xgboost + * \param handle handle + * \param iter current iteration rounds + * \param dmats pointers to data to be evaluated + * \param evnames pointers to names of each data + * \param len length of dmats + * \param out_result the string containing evaluation statistics + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, + int iter, + DMatrixHandle dmats[], + const char *evnames[], + bst_ulong len, + const char **out_result); /*! + * \brief make prediction based on dmat + * \param handle handle + * \param dmat data matrix + * \param option_mask bit-mask of options taken in prediction, possible values + * 0:normal prediction + * 1:output margin instead of transformed value + * 2:output leaf index of trees instead of leaf value, note leaf index is unique per tree + * \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees + * when the parameter is set to 0, we will use all the trees + * \param out_len used to store length of returning result + * \param out_result used to set a pointer to array + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterPredict(BoosterHandle handle, + DMatrixHandle dmat, + int option_mask, + unsigned ntree_limit, + bst_ulong *out_len, + const float **out_result); /*! + * \brief load model from existing file + * \param handle handle + * \param fname file name + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, + const char *fname); /*! + * \brief save model into existing file + * \param handle handle + * \param fname file name + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, + const char *fname); /*! + * \brief load model from in memory buffer + * \param handle handle + * \param buf pointer to the buffer + * \param len the length of the buffer + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, + const void *buf, + bst_ulong len);
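XGBoosterPredict is the one call with two out-parameters: a length and a float pointer into the booster's internal prediction buffer, which stays valid only until the next call on that booster. A sketch of reading it back, mirroring Booster.predict above (bst and dmat assumed to exist):

    length = ctypes.c_ulong()
    preds = ctypes.POINTER(ctypes.c_float)()
    _check_call(_LIB.XGBoosterPredict(bst.handle, dmat.handle,
                                      0,   # option_mask: plain prediction
                                      0,   # ntree_limit: 0 means use all trees
                                      ctypes.byref(length), ctypes.byref(preds)))
    values = ctypes2numpy(preds, length.value, np.float32)  # copy out promptly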
+/*! + * \brief save model into binary raw bytes, return header of the array + * the user must copy the result out before the next xgboost call + * \param handle handle + * \param out_len the argument to hold the output length + * \param out_dptr the argument to hold the output data pointer + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, + bst_ulong *out_len, + const char **out_dptr); +/*! + * \brief dump model, return array of strings representing model dump + * \param handle handle + * \param fmap name of the feature map file, can be an empty string + * \param with_stats whether to dump with statistics + * \param out_len length of output array + * \param out_dump_array pointer to hold the dump of each model + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterDumpModel(BoosterHandle handle, + const char *fmap, + int with_stats, + bst_ulong *out_len, + const char ***out_dump_array); #endif // XGBOOST_WRAPPER_H_