pommedeterresautee 2015-06-16 21:40:09 +02:00
commit 37714eb331
42 changed files with 1072 additions and 298 deletions

View File

@@ -21,8 +21,16 @@ xgboost-0.3
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
-in progress version
+xgboost-0.4
=====
-* Distributed version
-* Feature importance visualization in R module, thanks to Michael Benesty
-* Predict leaf inde
+* Distributed version of xgboost that runs on YARN, scales to billions of examples
+* Direct save/load data and model from/to S3 and HDFS
+* Feature importance visualization in R module, by Michael Benesty
+* Predict leaf index
+* Poisson regression for counts data
+* Early stopping option in training
+* Native save load support in R and python
+  - xgboost models now can be saved using save/load in R
+  - xgboost python model is now pickable
+* sklearn wrapper is supported in python module
+* Experimental External memory version
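As a quick illustration of the new native save/load support listed above (a minimal sketch, not taken from this commit; it assumes the agaricus demo data that ships with the R package):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nrounds = 2, objective = 'binary:logistic')
# the booster can now be stored and restored with base R save()/load()
save(bst, file = 'xgb.model.RData')
load('xgb.model.RData')
pred <- predict(bst, agaricus.train$data)
```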

View File

@@ -1,4 +1,4 @@
-Copyright (c) 2014 by Tianqi Chen and Contributors
+Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@@ -2,7 +2,7 @@ export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
ifeq ($(OS), Windows_NT)
export CXX = g++ -m64
@@ -18,7 +18,6 @@ endif
# by default use c++11
ifeq ($(cxx11),1)
CFLAGS += -std=c++11
-else
endif
# handling dmlc
@@ -38,6 +37,14 @@ else
LIBDMLC=dmlc_simple.o
endif
+ifndef WITH_FPIC
+WITH_FPIC = 1
+endif
+ifeq ($(WITH_FPIC), 1)
+CFLAGS += -fPIC
+endif
ifeq ($(OS), Windows_NT)
LIBRABIT = subtree/rabit/lib/librabit_empty.a
SLIB = wrapper/xgboost_wrapper.dll
@@ -51,11 +58,15 @@ BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
MPIBIN =
+ifeq ($(WITH_FPIC), 1)
TARGET = $(BIN) $(OBJ) $(SLIB)
+else
+TARGET = $(BIN)
+endif
.PHONY: clean all mpi python Rpack
-all: $(BIN) $(OBJ) $(SLIB)
+all: $(TARGET)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
@@ -79,7 +90,7 @@ subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+ cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
-$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+$(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: eXtreme Gradient Boosting
-Version: 0.3-4
+Version: 0.4-0
-Date: 2014-12-28
+Date: 2015-05-11
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
Maintainer: Tong He <hetong007@gmail.com>
Description: Xgboost is short for eXtreme Gradient Boosting, which is an

View File

@@ -54,6 +54,13 @@
#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
#' If folds are supplied, the nfold and stratified parameters would be ignored.
#' @param verbose \code{boolean}, print the statistics during the process
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#'
#' @param ... other parameters to pass to \code{params}.
#'
#' @return
@@ -86,7 +93,8 @@
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
prediction = FALSE, showsd = TRUE, metrics=list(),
-obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
+obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L,
+early.stop.round = NULL, maximize = NULL, ...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
@@ -110,6 +118,49 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
params <- append(params, list("eval_metric"=mc))
}
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.cv: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective)=='function') {
obj = params$objective
params$objective = NULL
}
if (!is.null(params$eval_metric) && !is.null(feval))
stop("xgb.cv: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric)=='function') {
feval = params$eval_metric
params$eval_metric = NULL
}
# Early Stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize = FALSE
} else {
maximize = TRUE
}
}
if (maximize) {
bestScore = 0
} else {
bestScore = Inf
}
bestInd = 0
earlyStopflag = FALSE
if (length(metrics)>1)
warning('Only the first metric is used for early stopping process.')
}
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
obj_type = params[['objective']]
mat_pred = FALSE
@@ -124,6 +175,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
else
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
+print.every.n = max(as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
@@ -148,7 +200,27 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
-if(verbose) paste(ret, "\n", sep="") %>% cat
+if(verbose)
+if (0==(i-1L)%%print.every.n)
+cat(ret, "\n", sep="")
# early_Stopping
if (!is.null(early.stop.round)){
score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2]
score = strsplit(score,'\\+|:')[[1]][[2]]
score = as.numeric(score)
if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
bestScore = score
bestInd = i
} else {
if (i-bestInd>=early.stop.round) {
earlyStopflag = TRUE
cat('Stopping. Best iteration:',bestInd)
break
}
}
}
}
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
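For reference, a minimal sketch of how the new xgb.cv arguments fit together (illustrative values only, assuming the agaricus demo data that ships with the R package):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
param <- list(max.depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# print every 5th message and stop once the error has not improved for 3 rounds
res <- xgb.cv(param, dtrain, nrounds = 50, nfold = 5, metrics = 'error',
              print.every.n = 5L, early.stop.round = 3, maximize = FALSE)
```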

View File

@@ -36,7 +36,7 @@
#' 3. Task Parameters
#'
#' \itemize{
-#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
+#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
#' \itemize{
#' \item \code{reg:linear} linear regression (Default).
#' \item \code{reg:logistic} logistic regression.
@@ -48,7 +48,7 @@
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
-#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
+#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
#' }
#'
#' @param data takes an \code{xgb.DMatrix} as the input.
@@ -66,7 +66,12 @@
#' prediction and dtrain,
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
-#'
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
+#' If set to an integer \code{k}, training with a validation set will stop if the performance
+#' keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@@ -98,7 +103,6 @@
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- dtrain
#' watchlist <- list(eval = dtest, train = dtrain)
-#' param <- list(max.depth = 2, eta = 1, silent = 1)
#' logregobj <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' preds <- 1/(1 + exp(-preds))
@@ -111,11 +115,13 @@
#' err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
#' return(list(metric = "error", value = err))
#' }
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
+#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
#' @export
#'
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
-obj = NULL, feval = NULL, verbose = 1, ...) {
+obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
+early.stop.round = NULL, maximize = NULL, ...) {
dtrain <- data
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list")
@@ -130,19 +136,85 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
}
if (length(watchlist) != 0 && verbose == 0) {
warning('watchlist is provided but verbose=0, no evaluation information will be printed')
+watchlist <- list()
}
params = append(params, list(...))
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.train: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective)=='function') {
obj = params$objective
params$objective = NULL
}
if (!is.null(params$eval_metric) && !is.null(feval))
stop("xgb.train: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric)=='function') {
feval = params$eval_metric
params$eval_metric = NULL
}
# Early stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (length(watchlist) == 0)
stop('For early stopping you need at least one set in watchlist.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize = FALSE
} else {
maximize = TRUE
}
}
if (maximize) {
bestScore = 0
} else {
bestScore = Inf
}
bestInd = 0
earlyStopflag = FALSE
if (length(watchlist)>1)
warning('Only the first data set in watchlist is used for early stopping process.')
}
handle <- xgb.Booster(params, append(watchlist, dtrain))
bst <- xgb.handleToBooster(handle)
+print.every.n=max( as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
if (length(watchlist) != 0) {
msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
+if (0== ( (i-1) %% print.every.n))
cat(paste(msg, "\n", sep=""))
if (!is.null(early.stop.round))
{
score = strsplit(msg,':|\\s+')[[1]][3]
score = as.numeric(score)
if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
bestScore = score
bestInd = i
} else {
if (i-bestInd>=early.stop.round) {
earlyStopflag = TRUE
cat('Stopping. Best iteration:',bestInd)
break
}
}
}
}
}
bst <- xgb.Booster.check(bst)
if (!is.null(early.stop.round)) {
bst$bestScore = bestScore
bst$bestInd = bestInd
}
return(bst)
}

View File

@@ -28,8 +28,14 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
+#' If set to an integer \code{k}, training with a validation set will stop if the performance
+#' keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@@ -51,7 +57,8 @@
#' @export
#'
xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
-verbose = 1, ...) {
+verbose = 1, print.every.n = 1L, early.stop.round = NULL,
+maximize = NULL, ...) {
if (is.null(missing)) {
dtrain <- xgb.get.DMatrix(data, label)
} else {
@@ -66,7 +73,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
watchlist <- list()
}
-bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose)
+bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
+early.stop.round = early.stop.round)
return(bst)
}

View File

@@ -8,11 +8,6 @@ For up-to-date version (which is recommended), please install from github. Windo
devtools::install_github('dmlc/xgboost',subdir='R-package')
```
-For stable version on CRAN, please run
-```r
-install.packages('xgboost')
-```
## Examples

View File

@@ -6,3 +6,5 @@ generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix Create Sparse Matrix
predict_leaf_indices Predicting the corresponding leaves
+early_stopping Early Stop in training
+poisson_regression Poisson Regression on count data

View File

@@ -40,10 +40,10 @@ evalerror <- function(preds, dtrain) {
return(list(metric = "error", value = err))
}
-param <- list(max.depth=2,eta=1,silent=1)
+param <- list(max.depth=2,eta=1,silent=1,
+objective = logregobj, eval_metric = evalerror)
# train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5,
-obj = logregobj, feval=evalerror)
+xgb.cv(param, dtrain, nround, nfold = 5)
# do cross validation with prediction values for each fold
res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)

View File

@@ -8,7 +8,6 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
-param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
@@ -33,10 +32,13 @@ evalerror <- function(preds, dtrain) {
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
+param <- list(max.depth=2,eta=1,nthread = 2, silent=1,
+objective=logregobj, eval_metric=evalerror)
print ('start training with user customized objective')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
#
# there can be cases where you want additional information
@@ -59,4 +61,5 @@ logregobjattr <- function(preds, dtrain) {
print ('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist,
+objective=logregobj, eval_metric=evalerror)

View File

@@ -0,0 +1,40 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest)
num_round <- 20
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
print ('start training with early Stopping setting')
bst <- xgb.train(param, dtrain, num_round, watchlist,
objective = logregobj, eval_metric = evalerror, maximize = FALSE,
early.stop.round = 3)
bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
objective = logregobj, eval_metric = evalerror,
maximize = FALSE, early.stop.round = 3)

View File

@@ -0,0 +1,7 @@
data(mtcars)
head(mtcars)
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
objective='count:poisson',nrounds=5)
pred = predict(bst,as.matrix(mtcars[,-11]))
sqrt(mean((pred-mtcars[,11])^2))

View File

@@ -7,3 +7,5 @@ demo(generalized_linear_model)
demo(cross_validation)
demo(create_sparse_matrix)
demo(predict_leaf_indices)
+demo(early_stopping)
+demo(poisson_regression)

View File

@@ -7,7 +7,8 @@
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
-verbose = T, ...)
+verbose = T, print.every.n = 1L, early.stop.round = NULL,
+maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@@ -65,6 +66,15 @@ If folds are supplied, the nfold and stratified parameters would be ignored.}
\item{verbose}{\code{boolean}, print the statistics during the process}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
+keeps getting worse consecutively for \code{k} rounds.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\value{

View File

@@ -5,7 +5,8 @@
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-feval = NULL, verbose = 1, ...)
+feval = NULL, verbose = 1, print.every.n = 1L,
+early.stop.round = NULL, maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters.
@@ -42,7 +43,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
3. Task Parameters
\itemize{
-\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
+\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
\itemize{
\item \code{reg:linear} linear regression (Default).
\item \code{reg:logistic} logistic regression.
@@ -54,7 +55,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
-\item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
+\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
}}
\item{data}{takes an \code{xgb.DMatrix} as the input.}
@@ -77,6 +78,15 @@ prediction and dtrain,}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
+keeps getting worse consecutively for \code{k} rounds.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
@@ -110,7 +120,6 @@ data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
-param <- list(max.depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
@@ -123,6 +132,7 @@ evalerror <- function(preds, dtrain) {
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
-bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
+param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
}

View File

@@ -5,7 +5,8 @@
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
-nrounds, verbose = 1, ...)
+nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL,
+maximize = NULL, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@@ -41,6 +42,15 @@ Commonly used ones are:
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
+keeps getting worse consecutively for \code{k} rounds.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{

View File

@@ -70,10 +70,10 @@ extern "C" {
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
_WrapperBegin();
void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
@@ -91,10 +91,10 @@ extern "C" {
}
}
void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
@@ -120,10 +120,10 @@ extern "C" {
}
void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata);
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
@@ -134,10 +134,10 @@ extern "C" {
idxvec[i] = INTEGER(idxset)[i] - 1;
}
void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
@@ -157,10 +157,7 @@ extern "C" {
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
}
XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
-_WrapperEnd();
-return;
-}
-{
+} else {
std::vector<float> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
@@ -177,12 +174,12 @@ extern "C" {
bst_ulong olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
-_WrapperEnd();
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixNumRow_R(SEXP handle) {
@@ -203,10 +200,10 @@ extern "C" {
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
}
void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
@@ -252,10 +249,12 @@ extern "C" {
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
-_WrapperEnd();
-return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
+const char *ret =
+XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
-BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
+BeginPtr(vec_dmats), BeginPtr(vec_sptr), len);
+_WrapperEnd();
+return mkString(ret);
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
_WrapperBegin();
@@ -265,12 +264,12 @@ extern "C" {
asInteger(option_mask),
asInteger(ntree_limit),
&olen);
-_WrapperEnd();
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
@@ -305,17 +304,18 @@ extern "C" {
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
_WrapperBegin();
bst_ulong olen;
-const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+const char **res =
+XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
asInteger(with_stats),
&olen);
-_WrapperEnd();
SEXP out = PROTECT(allocVector(STRSXP, olen));
for (size_t i = 0; i < olen; ++i) {
stringstream stream;
stream << "booster["<<i<<"]\n" << res[i];
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
}
+_WrapperEnd();
UNPROTECT(1);
return out;
}

View File

@@ -57,11 +57,9 @@ devtools::install_github('dmlc/xgboost', subdir='R-package')
Cran version
------------
-For stable version on *CRAN*, run:
-```{r installCran, eval=FALSE}
-install.packages('xgboost')
-```
+As of 2015-03-13, xgboost was removed from the CRAN repository.
+Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
Learning
========

View File

@@ -2,7 +2,7 @@ XGBoost: eXtreme Gradient Boosting
==================================
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
-It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also also distributed and scale to Terascale data
+It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scale to Terascale data
Contributors: https://github.com/dmlc/xgboost/graphs/contributors
@@ -20,33 +20,26 @@ Distributed Version: [Distributed XGBoost](multi-node)
Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
+XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects
What's New
==========
+* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
* XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
- Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
* [External Memory Version](doc/external_memory.md)
-* XGBoost now support HDFS and S3
-* [Distributed XGBoost now runs on YARN](https://github.com/dmlc/wormhole/tree/master/learn/xgboost)
-* [xgboost user group](https://groups.google.com/forum/#!forum/xgboost-user/) for tracking changes, sharing your experience on xgboost
-* New features in the lastest changes :)
-- Distributed version that scale xgboost to even larger problems with cluster
-- Feature importance visualization in R module, thanks to Michael Benesty
-- Predict leaf index, see [demo/guide-python/predict_leaf_indices.py](demo/guide-python/predict_leaf_indices.py)
-* XGBoost wins [Tradeshift Text Classification](https://kaggle2.blob.core.windows.net/forum-message-attachments/60041/1813/TradeshiftTextClassification.pdf?sv=2012-02-12&se=2015-01-02T13%3A55%3A16Z&sr=b&sp=r&sig=5MHvyjCLESLexYcvbSRFumGQXCS7MVmfdBIY3y01tMk%3D)
-* XGBoost wins [HEP meets ML Award in Higgs Boson Challenge](http://atlas.ch/news/2014/machine-learning-wins-the-higgs-challenge.html)
Features
========
-* Sparse feature format:
-- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
-* Push the limit on single machine:
-- Efficient implementation that optimizes memory and computation.
-* Speed: XGBoost is very fast
-- IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
-* Layout of gradient boosting algorithm to support user defined objective
-* Distributed and portable
-- The distributed version of xgboost is highly portable and can be used in different platforms
-- It inheritates all the optimizations made in single machine mode, maximumly utilize the resources using both multi-threading and distributed computing.
+* Easily accessible in python, R, Julia, CLI
+* Fast speed and memory efficient
+- Can be more than 10 times faster than GBM in sklearn and R
+- Handles sparse matrices, support external memory
+* Accurate prediction, and used extensively by data scientists and kagglers
+- See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
+* Distributed and Portable
+- The distributed version runs on Hadoop (YARN), MPI, SGE etc.
+- Scales to billions of examples and beyond
Build
=======
@@ -56,11 +49,9 @@ Build
Version
=======
-* This version xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexibility
-* This version of xgboost is not compatible with 0.2x, due to huge amount of changes in code structure
-- This means the model and buffer file of previous version can not be loaded in xgboost-3.0
-* For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)
-* Change log in [CHANGES.md](CHANGES.md)
+* Current version xgboost-0.4, a lot improvment has been made since 0.3
+- Change log in [CHANGES.md](CHANGES.md)
+- This version is compatible with 0.3x versions
XGBoost in Graphlab Create
==========================

View File

@@ -1,6 +1,7 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
+import pickle
import xgboost as xgb
### simple example
@@ -28,6 +29,7 @@ bst.dump_model('dump.nice.txt','../data/featmap.txt')
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
+# save model
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
@@ -36,6 +38,14 @@ preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0
+# alternatively, you can pickle the booster
+pks = pickle.dumps(bst2)
+# load model and data in
+bst3 = pickle.loads(pks)
+preds3 = bst3.predict(dtest2)
+# assert they are the same
+assert np.sum(np.abs(preds3-preds)) == 0
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')

View File

@@ -4,18 +4,17 @@ Created on 1 Apr 2015
@author: Jamie Hall
'''
+import pickle
import xgboost as xgb
import numpy as np
from sklearn.cross_validation import KFold
-from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
+from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston
rng = np.random.RandomState(31337)
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
@@ -60,4 +59,9 @@ clf.fit(X,y)
print(clf.best_score_)
print(clf.best_params_)
# The sklearn API models are picklable
print("Pickling sklearn API models")
# must open in binary format to pickle
pickle.dump(clf, open("best_boston.pkl", "wb"))
clf2 = pickle.load(open("best_boston.pkl", "rb"))
print(np.allclose(clf.predict(X), clf2.predict(X)))

View File

@@ -0,0 +1,35 @@
import os
if __name__ == "__main__":
# NOTE: on posix systems, this *has* to be here and in the
# `__name__ == "__main__"` clause to run XGBoost in parallel processes
# using fork, if XGBoost was built with OpenMP support. Otherwise, if you
# build XGBoost without OpenMP support, you can use fork, which is the
# default backend for joblib, and omit this.
try:
from multiprocessing import set_start_method
except ImportError:
raise ImportError("Unable to import multiprocessing.set_start_method."
" This example only runs on Python 3.4")
set_start_method("forkserver")
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_boston
import xgboost as xgb
rng = np.random.RandomState(31337)
print("Parallel Parameter optimization")
boston = load_boston()
os.environ["OMP_NUM_THREADS"] = "2" # or to whatever you want
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

View File

@@ -26,19 +26,26 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
#### Parameter for Tree Booster
* eta [default=0.3]
-- step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative.
+- step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative.
+- range: [0,1]
* gamma [default=0]
- minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
+- range: [0,∞]
* max_depth [default=6]
- maximum depth of a tree
+- range: [1,∞]
* min_child_weight [default=1]
- minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.
+- range: [0,∞]
* max_delta_step [default=0]
- Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update
+- range: [0,∞]
* subsample [default=1]
- subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting.
+- range: (0,1]
* colsample_bytree [default=1]
- subsample ratio of columns when constructing each tree.
+- range: (0,1]
#### Parameter for Linear Booster
* lambda [default=0]
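Illustrative only: a small R sketch passing the tree booster parameters documented above (values are arbitrary but stay inside the documented ranges; assumes the agaricus demo data that ships with the R package):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
param <- list(eta = 0.1, gamma = 1, max_depth = 6, min_child_weight = 1,
              max_delta_step = 0, subsample = 0.8, colsample_bytree = 0.8,
              objective = 'binary:logistic')
bst <- xgboost(params = param, data = agaricus.train$data,
               label = agaricus.train$label, nrounds = 10)
```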

View File

@@ -140,8 +140,12 @@ class IFMatrix {
* \brief check if column access is supported, if not, initialize column access
* \param enabled whether certain feature should be included in column access
* \param subsample subsample ratio when generating column access
+* \param max_row_perbatch auxilary information, maximum row used in each column batch
+* this is a hint information that can be ignored by the implementation
*/
-virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
+virtual void InitColAccess(const std::vector<bool> &enabled,
+float subsample,
+size_t max_row_perbatch) = 0;
// the following are column meta data, should be able to answer them fast
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const = 0;

View File

@@ -64,7 +64,13 @@ class GBTree : public IGradBooster {
}
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
+if (with_pbuffer) {
fo.Write(&mparam, sizeof(ModelParam));
+} else {
+ModelParam p = mparam;
+p.num_pbuffer = 0;
+fo.Write(&p, sizeof(ModelParam));
+}
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->SaveModel(fo);
}

View File

@@ -33,10 +33,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
}
virtual bool Next(void) {
if (!itr.Next(page_)) return false;
-out_.base_rowid = base_rowid_;
-out_.ind_ptr = BeginPtr(page_->offset);
-out_.data_ptr = BeginPtr(page_->data);
-out_.size = page_->offset.size() - 1;
+out_ = page_->GetRowBatch(base_rowid_);
base_rowid_ += out_.size;
return true;
}
@@ -198,8 +195,8 @@ class DMatrixPageBase : public DataMatrix {
}
/*! \brief magic number used to identify DMatrix */
static const int kMagic = TKMagic;
-/*! \brief page size 64 MB */
-static const size_t kPageSize = 64UL << 20UL;
+/*! \brief page size 32 MB */
+static const size_t kPageSize = 32UL << 20UL;
protected:
virtual void set_cache_file(const std::string &cache_file) = 0;
@@ -236,7 +233,7 @@ class DMatrixPage : public DMatrixPageBase<0xffffab02> {
class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> {
public:
DMatrixHalfRAM(void) {
-fmat_ = new FMatrixS(iter_);
+fmat_ = new FMatrixS(iter_, this->info);
}
virtual ~DMatrixHalfRAM(void) {
delete fmat_;

View File

@ -58,11 +58,13 @@ struct ColConvertFactory {
return true; return true;
} }
inline void Setup(float pkeep, inline void Setup(float pkeep,
size_t max_row_perbatch,
size_t num_col, size_t num_col,
utils::IIterator<RowBatch> *iter, utils::IIterator<RowBatch> *iter,
std::vector<bst_uint> *buffered_rowset, std::vector<bst_uint> *buffered_rowset,
const std::vector<bool> *enabled) { const std::vector<bool> *enabled) {
pkeep_ = pkeep; pkeep_ = pkeep;
max_row_perbatch_ = max_row_perbatch;
num_col_ = num_col; num_col_ = num_col;
iter_ = iter; iter_ = iter;
buffered_rowset_ = buffered_rowset; buffered_rowset_ = buffered_rowset;
@ -87,7 +89,8 @@ struct ColConvertFactory {
tmp_.Push(batch[i]); tmp_.Push(batch[i]);
} }
} }
if (tmp_.MemCostBytes() >= kPageSize) { if (tmp_.MemCostBytes() >= kPageSize ||
tmp_.Size() >= max_row_perbatch_) {
this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop, this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
*enabled_, val); *enabled_, val);
return true; return true;
@ -157,6 +160,8 @@ struct ColConvertFactory {
} }
// probability of keep // probability of keep
float pkeep_; float pkeep_;
// maximum number of rows per batch
size_t max_row_perbatch_;
// number of columns // number of columns
size_t num_col_; size_t num_col_;
// row batch iterator // row batch iterator
@ -208,10 +213,10 @@ class FMatrixPage : public IFMatrix {
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_; return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
} }
virtual void InitColAccess(const std::vector<bool> &enabled, virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep = 1.0f) { float pkeep, size_t max_row_perbatch) {
if (this->HaveColAccess()) return; if (this->HaveColAccess()) return;
if (TryLoadColData()) return; if (TryLoadColData()) return;
this->InitColData(enabled, pkeep); this->InitColData(enabled, pkeep, max_row_perbatch);
utils::Check(TryLoadColData(), "failed on creating col.blob"); utils::Check(TryLoadColData(), "failed on creating col.blob");
} }
/*! /*!
@ -282,7 +287,8 @@ class FMatrixPage : public IFMatrix {
* \brief initialize column data * \brief initialize column data
* \param pkeep probability to keep a row * \param pkeep probability to keep a row
*/ */
inline void InitColData(const std::vector<bool> &enabled, float pkeep) { inline void InitColData(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
// clear rowset // clear rowset
buffered_rowset_.clear(); buffered_rowset_.clear();
col_size_.resize(info.num_col()); col_size_.resize(info.num_col());
@ -294,7 +300,7 @@ class FMatrixPage : public IFMatrix {
size_t bytes_write = 0; size_t bytes_write = 0;
utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer; utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer;
citer.SetParam("buffer_size", "2"); citer.SetParam("buffer_size", "2");
citer.get_factory().Setup(pkeep, info.num_col(), citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(),
iter_, &buffered_rowset_, &enabled); iter_, &buffered_rowset_, &enabled);
citer.Init(); citer.Init();
SparsePage *pcol; SparsePage *pcol;

View File

@ -28,7 +28,7 @@ class DMatrixSimple : public DataMatrix {
public: public:
// constructor // constructor
DMatrixSimple(void) : DataMatrix(kMagic) { DMatrixSimple(void) : DataMatrix(kMagic) {
fmat_ = new FMatrixS(new OneBatchIter(this)); fmat_ = new FMatrixS(new OneBatchIter(this), this->info);
this->Clear(); this->Clear();
} }
// virtual destructor // virtual destructor
@ -171,7 +171,7 @@ class DMatrixSimple : public DataMatrix {
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname); utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
info.LoadBinary(fs); info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); LoadBinary(fs, &row_ptr_, &row_data_);
fmat_->LoadColAccess(fs); fmat_->LoadColAccess(fs);
if (!silent) { if (!silent) {
@ -198,9 +198,8 @@ class DMatrixSimple : public DataMatrix {
utils::FileStream fs(utils::FopenCheck(fname, "wb")); utils::FileStream fs(utils::FopenCheck(fname, "wb"));
int tmagic = kMagic; int tmagic = kMagic;
fs.Write(&tmagic, sizeof(tmagic)); fs.Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fs); info.SaveBinary(fs);
FMatrixS::SaveBinary(fs, row_ptr_, row_data_); SaveBinary(fs, row_ptr_, row_data_);
fmat_->SaveColAccess(fs); fmat_->SaveColAccess(fs);
fs.Close(); fs.Close();
@ -251,6 +250,42 @@ class DMatrixSimple : public DataMatrix {
static const int kMagic = 0xffffab01; static const int kMagic = 0xffffab01;
protected: protected:
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format");
}
}
// one batch iterator that return content in the matrix // one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<RowBatch> { struct OneBatchIter: utils::IIterator<RowBatch> {
explicit OneBatchIter(DMatrixSimple *parent) explicit OneBatchIter(DMatrixSimple *parent)

View File

@ -1,15 +1,18 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP #ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP #define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
/*! /*!
* \file simple_fmatrix-inl.hpp * \file simple_fmatrix-inl.hpp
* \brief the input data structure for gradient boosting * \brief the input data structure for gradient boosting
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#include <limits>
#include "../data.h" #include "../data.h"
#include "../utils/utils.h" #include "../utils/utils.h"
#include "../utils/random.h" #include "../utils/random.h"
#include "../utils/omp.h" #include "../utils/omp.h"
#include "../learner/dmatrix.h"
#include "../utils/group_data.h" #include "../utils/group_data.h"
#include "./sparse_batch_page.h"
namespace xgboost { namespace xgboost {
namespace io { namespace io {
@ -20,7 +23,9 @@ class FMatrixS : public IFMatrix {
public: public:
typedef SparseBatch::Entry Entry; typedef SparseBatch::Entry Entry;
/*! \brief constructor */ /*! \brief constructor */
FMatrixS(utils::IIterator<RowBatch> *iter) { FMatrixS(utils::IIterator<RowBatch> *iter,
const learner::MetaInfo &info)
: info_(info) {
this->iter_ = iter; this->iter_ = iter;
} }
// destructor // destructor
@ -29,12 +34,12 @@ class FMatrixS : public IFMatrix {
} }
/*! \return whether column access is enabled */ /*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const { virtual bool HaveColAccess(void) const {
return col_ptr_.size() != 0; return col_size_.size() != 0;
} }
/*! \brief get number of columns */ /*! \brief get number of columns */
virtual size_t NumCol(void) const { virtual size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access"); utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1; return col_size_.size() - 1;
} }
/*! \brief get number of buffered rows */ /*! \brief get number of buffered rows */
virtual const std::vector<bst_uint> &buffered_rowset(void) const { virtual const std::vector<bst_uint> &buffered_rowset(void) const {
@ -42,17 +47,17 @@ class FMatrixS : public IFMatrix {
} }
/*! \brief get column size */ /*! \brief get column size */
virtual size_t GetColSize(size_t cidx) const { virtual size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx]; return col_size_[cidx];
} }
/*! \brief get column density */ /*! \brief get column density */
virtual float GetColDensity(size_t cidx) const { virtual float GetColDensity(size_t cidx) const {
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]); size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size(); return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
} }
virtual void InitColAccess(const std::vector<bool> &enabled, virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep = 1.0f) { float pkeep, size_t max_row_perbatch) {
if (this->HaveColAccess()) return; if (this->HaveColAccess()) return;
this->InitColData(pkeep, enabled); this->InitColData(enabled, pkeep, max_row_perbatch);
} }
/*! /*!
* \brief get the row iterator associated with FMatrix * \brief get the row iterator associated with FMatrix
@ -70,7 +75,7 @@ class FMatrixS : public IFMatrix {
for (size_t i = 0; i < ncol; ++i) { for (size_t i = 0; i < ncol; ++i) {
col_iter_.col_index_[i] = static_cast<bst_uint>(i); col_iter_.col_index_[i] = static_cast<bst_uint>(i);
} }
col_iter_.SetBatch(col_ptr_, col_data_); col_iter_.BeforeFirst();
return &col_iter_; return &col_iter_;
} }
/*! /*!
@ -82,7 +87,7 @@ class FMatrixS : public IFMatrix {
for (size_t i = 0; i < fset.size(); ++i) { for (size_t i = 0; i < fset.size(); ++i) {
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
} }
col_iter_.SetBatch(col_ptr_, col_data_); col_iter_.BeforeFirst();
return &col_iter_; return &col_iter_;
} }
/*! /*!
@ -90,64 +95,52 @@ class FMatrixS : public IFMatrix {
* \param fo output stream to save to * \param fo output stream to save to
*/ */
inline void SaveColAccess(utils::IStream &fo) const { inline void SaveColAccess(utils::IStream &fo) const {
fo.Write(buffered_rowset_); size_t n = 0;
if (buffered_rowset_.size() != 0) { fo.Write(&n, sizeof(n));
SaveBinary(fo, col_ptr_, col_data_);
}
} }
/*! /*!
* \brief load column access data from stream * \brief load column access data from stream
* \param fo output stream to load from * \param fo output stream to load from
*/ */
inline void LoadColAccess(utils::IStream &fi) { inline void LoadColAccess(utils::IStream &fi) {
utils::Check(fi.Read(&buffered_rowset_), "invalid input file format"); // do nothing in load col access
if (buffered_rowset_.size() != 0) {
LoadBinary(fi, &col_ptr_, &col_data_);
}
}
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format");
}
} }
protected: protected:
/*! /*!
* \brief initialize column data * \brief initialize column data
* \param enabled the list of enabled columns
* \param pkeep probability to keep a row * \param pkeep probability to keep a row
* \param max_row_perbatch maximum row per batch
*/ */
inline void InitColData(float pkeep, const std::vector<bool> &enabled) { inline void InitColData(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
col_iter_.Clear();
if (info_.num_row() < max_row_perbatch) {
SparsePage *page = new SparsePage();
this->MakeOneBatch(enabled, pkeep, page);
col_iter_.cpages_.push_back(page);
} else {
this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
}
// setup col-size
col_size_.resize(info_.num_col());
std::fill(col_size_.begin(), col_size_.end(), 0);
for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
SparsePage *pcol = col_iter_.cpages_[i];
for (size_t j = 0; j < pcol->Size(); ++j) {
col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
}
}
}
/*!
* \brief make column page from iterator
* \param pkeep probability to keep a row
* \param pcol the target column
*/
inline void MakeOneBatch(const std::vector<bool> &enabled,
float pkeep,
SparsePage *pcol) {
// clear rowset // clear rowset
buffered_rowset_.clear(); buffered_rowset_.clear();
// bit map // bit map
@ -157,8 +150,9 @@ class FMatrixS : public IFMatrix {
{ {
nthread = omp_get_num_threads(); nthread = omp_get_num_threads();
} }
// build the column matrix in parallel pcol->Clear();
utils::ParallelGroupBuilder<RowBatch::Entry> builder(&col_ptr_, &col_data_); utils::ParallelGroupBuilder<SparseBatch::Entry>
builder(&pcol->offset, &pcol->data);
builder.InitBudget(0, nthread); builder.InitBudget(0, nthread);
// start working // start working
iter_->BeforeFirst(); iter_->BeforeFirst();
@ -209,66 +203,167 @@ class FMatrixS : public IFMatrix {
} }
} }
} }
utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
// sort columns // sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol()); bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
for (bst_omp_uint i = 0; i < ncol; ++i) { for (bst_omp_uint i = 0; i < ncol; ++i) {
if (col_ptr_[i] < col_ptr_[i + 1]) { if (pcol->offset[i] < pcol->offset[i + 1]) {
std::sort(BeginPtr(col_data_) + col_ptr_[i], std::sort(BeginPtr(pcol->data) + pcol->offset[i],
BeginPtr(col_data_) + col_ptr_[i + 1], Entry::CmpValue); BeginPtr(pcol->data) + pcol->offset[i + 1],
SparseBatch::Entry::CmpValue);
}
}
}
inline void MakeManyBatch(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
size_t btop = 0;
buffered_rowset_.clear();
// internal temp cache
SparsePage tmp; tmp.Clear();
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(ridx);
tmp.Push(batch[i]);
}
if (tmp.Size() >= max_row_perbatch) {
SparsePage *page = new SparsePage();
this->MakeColPage(tmp.GetRowBatch(0),
BeginPtr(buffered_rowset_) + btop,
enabled, page);
col_iter_.cpages_.push_back(page);
btop = buffered_rowset_.size();
tmp.Clear();
}
}
}
if (tmp.Size() != 0) {
SparsePage *page = new SparsePage();
this->MakeColPage(tmp.GetRowBatch(0),
BeginPtr(buffered_rowset_) + btop,
enabled, page);
col_iter_.cpages_.push_back(page);
}
}
// make column page from subset of rowbatchs
inline void MakeColPage(const RowBatch &batch,
const bst_uint *ridx,
const std::vector<bool> &enabled,
SparsePage *pcol) {
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
if (nthread > max_nthread) {
nthread = max_nthread;
}
}
pcol->Clear();
utils::ParallelGroupBuilder<SparseBatch::Entry>
builder(&pcol->offset, &pcol->data);
builder.InitBudget(info_.num_col(), nthread);
bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint i = 0; i < ndata; ++i) {
int tid = omp_get_thread_num();
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
const SparseBatch::Entry &e = inst[j];
if (enabled[e.index]) {
builder.AddBudget(e.index, tid);
}
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint i = 0; i < ndata; ++i) {
int tid = omp_get_thread_num();
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
const SparseBatch::Entry &e = inst[j];
builder.Push(e.index,
SparseBatch::Entry(ridx[i], e.fvalue),
tid);
}
}
utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
// sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
for (bst_omp_uint i = 0; i < ncol; ++i) {
if (pcol->offset[i] < pcol->offset[i + 1]) {
std::sort(BeginPtr(pcol->data) + pcol->offset[i],
BeginPtr(pcol->data) + pcol->offset[i + 1],
SparseBatch::Entry::CmpValue);
} }
} }
} }
private: private:
// one batch iterator that return content in the matrix // one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<ColBatch> { struct ColBatchIter: utils::IIterator<ColBatch> {
OneBatchIter(void) : at_first_(true){} ColBatchIter(void) : data_ptr_(0) {}
virtual ~OneBatchIter(void) {} virtual ~ColBatchIter(void) {
this->Clear();
}
virtual void BeforeFirst(void) { virtual void BeforeFirst(void) {
at_first_ = true; data_ptr_ = 0;
} }
virtual bool Next(void) { virtual bool Next(void) {
if (!at_first_) return false; if (data_ptr_ >= cpages_.size()) return false;
at_first_ = false; data_ptr_ += 1;
SparsePage *pcol = cpages_[data_ptr_ - 1];
batch_.size = col_index_.size();
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
for (size_t i = 0; i < col_data_.size(); ++i) {
const bst_uint ridx = col_index_[i];
col_data_[i] = SparseBatch::Inst
(BeginPtr(pcol->data) + pcol->offset[ridx],
static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
}
batch_.col_index = BeginPtr(col_index_);
batch_.col_data = BeginPtr(col_data_);
return true; return true;
} }
virtual const ColBatch &Value(void) const { virtual const ColBatch &Value(void) const {
return batch_; return batch_;
} }
inline void SetBatch(const std::vector<size_t> &ptr, inline void Clear(void) {
const std::vector<ColBatch::Entry> &data) { for (size_t i = 0; i < cpages_.size(); ++i) {
batch_.size = col_index_.size(); delete cpages_[i];
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
for (size_t i = 0; i < col_data_.size(); ++i) {
const bst_uint ridx = col_index_[i];
col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
static_cast<bst_uint>(ptr[ridx+1] - ptr[ridx]));
} }
batch_.col_index = BeginPtr(col_index_); cpages_.clear();
batch_.col_data = BeginPtr(col_data_);
this->BeforeFirst();
} }
// data content // data content
std::vector<bst_uint> col_index_; std::vector<bst_uint> col_index_;
// column content
std::vector<ColBatch::Inst> col_data_; std::vector<ColBatch::Inst> col_data_;
// whether is at first // column sparse pages
bool at_first_; std::vector<SparsePage*> cpages_;
// data pointer
size_t data_ptr_;
// temporary space for batch // temporary space for batch
ColBatch batch_; ColBatch batch_;
}; };
// --- data structure used to support InitColAccess -- // --- data structure used to support InitColAccess --
// column iterator // column iterator
OneBatchIter col_iter_; ColBatchIter col_iter_;
// shared meta info with DMatrix
const learner::MetaInfo &info_;
// row iterator // row iterator
utils::IIterator<RowBatch> *iter_; utils::IIterator<RowBatch> *iter_;
/*! \brief list of row index that are buffered */ /*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_; std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */ // count for column data
std::vector<size_t> col_ptr_; std::vector<size_t> col_size_;
/*! \brief column datas in CSC format */
std::vector<ColBatch::Entry> col_data_;
}; };
} // namespace io } // namespace io
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP #endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_

View File

@ -178,9 +178,23 @@ class SparsePage {
offset.push_back(offset.back() + inst.length); offset.push_back(offset.back() + inst.length);
size_t begin = data.size(); size_t begin = data.size();
data.resize(begin + inst.length); data.resize(begin + inst.length);
if (inst.length != 0) {
std::memcpy(BeginPtr(data) + begin, inst.data, std::memcpy(BeginPtr(data) + begin, inst.data,
sizeof(SparseBatch::Entry) * inst.length); sizeof(SparseBatch::Entry) * inst.length);
} }
}
/*!
* \param base_rowid base_rowid of the data
* \return row batch representation of the page
*/
inline RowBatch GetRowBatch(size_t base_rowid) const {
RowBatch out;
out.base_rowid = base_rowid;
out.ind_ptr = BeginPtr(offset);
out.data_ptr = BeginPtr(data);
out.size = offset.size() - 1;
return out;
}
private: private:
/*! \brief external memory column offset */ /*! \brief external memory column offset */

View File

@ -33,6 +33,7 @@ class BoostLearner : public rabit::Serializable {
silent= 0; silent= 0;
prob_buffer_row = 1.0f; prob_buffer_row = 1.0f;
distributed_mode = 0; distributed_mode = 0;
updater_mode = 0;
pred_buffer_size = 0; pred_buffer_size = 0;
seed_per_iteration = 0; seed_per_iteration = 0;
seed = 0; seed = 0;
@ -95,6 +96,7 @@ class BoostLearner : public rabit::Serializable {
utils::Error("%s is invalid value for dsplit, should be row or col", val); utils::Error("%s is invalid value for dsplit, should be row or col", val);
} }
} }
if (!strcmp(name, "updater_mode")) updater_mode = atoi(val);
if (!strcmp(name, "prob_buffer_row")) { if (!strcmp(name, "prob_buffer_row")) {
prob_buffer_row = static_cast<float>(atof(val)); prob_buffer_row = static_cast<float>(atof(val));
utils::Check(distributed_mode == 0, utils::Check(distributed_mode == 0,
@ -157,11 +159,9 @@ class BoostLearner : public rabit::Serializable {
/*! /*!
* \brief load model from stream * \brief load model from stream
* \param fi input stream * \param fi input stream
* \param with_pbuffer whether to load with predict buffer
* \param calc_num_feature whether call InitTrainer with calc_num_feature * \param calc_num_feature whether call InitTrainer with calc_num_feature
*/ */
inline void LoadModel(utils::IStream &fi, inline void LoadModel(utils::IStream &fi,
bool with_pbuffer = true,
bool calc_num_feature = true) { bool calc_num_feature = true) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format"); "BoostLearner: wrong model format");
@ -189,15 +189,15 @@ class BoostLearner : public rabit::Serializable {
char tmp[32]; char tmp[32];
utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class); utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class);
obj_->SetParam("num_class", tmp); obj_->SetParam("num_class", tmp);
gbm_->LoadModel(fi, with_pbuffer); gbm_->LoadModel(fi, mparam.saved_with_pbuffer != 0);
if (!with_pbuffer || distributed_mode == 2) { if (mparam.saved_with_pbuffer == 0) {
gbm_->ResetPredBuffer(pred_buffer_size); gbm_->ResetPredBuffer(pred_buffer_size);
} }
} }
// rabit load model from rabit checkpoint // rabit load model from rabit checkpoint
virtual void Load(rabit::Stream *fi) { virtual void Load(rabit::Stream *fi) {
// for row split, we should not keep pbuffer // for row split, we should not keep pbuffer
this->LoadModel(*fi, distributed_mode != 2, false); this->LoadModel(*fi, false);
} }
// rabit save model to rabit checkpoint // rabit save model to rabit checkpoint
virtual void Save(rabit::Stream *fo) const { virtual void Save(rabit::Stream *fo) const {
@ -218,18 +218,20 @@ class BoostLearner : public rabit::Serializable {
if (header == "bs64") { if (header == "bs64") {
utils::Base64InStream bsin(fi); utils::Base64InStream bsin(fi);
bsin.InitPosition(); bsin.InitPosition();
this->LoadModel(bsin); this->LoadModel(bsin, true);
} else if (header == "binf") { } else if (header == "binf") {
this->LoadModel(*fi); this->LoadModel(*fi, true);
} else { } else {
delete fi; delete fi;
fi = utils::IStream::Create(fname, "r"); fi = utils::IStream::Create(fname, "r");
this->LoadModel(*fi); this->LoadModel(*fi, true);
} }
delete fi; delete fi;
} }
inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const { inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
fo.Write(&mparam, sizeof(ModelParam)); ModelParam p = mparam;
p.saved_with_pbuffer = static_cast<int>(with_pbuffer);
fo.Write(&p, sizeof(ModelParam));
fo.Write(name_obj_); fo.Write(name_obj_);
fo.Write(name_gbm_); fo.Write(name_gbm_);
gbm_->SaveModel(fo, with_pbuffer); gbm_->SaveModel(fo, with_pbuffer);
@ -237,17 +239,18 @@ class BoostLearner : public rabit::Serializable {
/*! /*!
* \brief save model into file * \brief save model into file
* \param fname file name * \param fname file name
* \param with_pbuffer whether save pbuffer together
*/ */
inline void SaveModel(const char *fname) const { inline void SaveModel(const char *fname, bool with_pbuffer) const {
utils::IStream *fo = utils::IStream::Create(fname, "w"); utils::IStream *fo = utils::IStream::Create(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) { if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5); fo->Write("bs64\t", 5);
utils::Base64OutStream bout(fo); utils::Base64OutStream bout(fo);
this->SaveModel(bout); this->SaveModel(bout, with_pbuffer);
bout.Finish('\n'); bout.Finish('\n');
} else { } else {
fo->Write("binf", 4); fo->Write("binf", 4);
this->SaveModel(*fo); this->SaveModel(*fo, with_pbuffer);
} }
delete fo; delete fo;
} }
@ -259,8 +262,16 @@ class BoostLearner : public rabit::Serializable {
inline void CheckInit(DMatrix *p_train) { inline void CheckInit(DMatrix *p_train) {
int ncol = static_cast<int>(p_train->info.info.num_col); int ncol = static_cast<int>(p_train->info.info.num_col);
std::vector<bool> enabled(ncol, true); std::vector<bool> enabled(ncol, true);
// set max row per batch to limited value
// in distributed mode, use safe choice otherwise
size_t max_row_perbatch = std::numeric_limits<size_t>::max();
if (updater_mode != 0 || distributed_mode == 2) {
max_row_perbatch = 32UL << 10UL;
}
// initialize column access // initialize column access
p_train->fmat()->InitColAccess(enabled, prob_buffer_row); p_train->fmat()->InitColAccess(enabled,
prob_buffer_row,
max_row_perbatch);
const int kMagicPage = 0xffffab02; const int kMagicPage = 0xffffab02;
// check, if it is DMatrixPage, then use hist maker // check, if it is DMatrixPage, then use hist maker
if (p_train->magic == kMagicPage) { if (p_train->magic == kMagicPage) {
@ -442,14 +453,17 @@ class BoostLearner : public rabit::Serializable {
unsigned num_feature; unsigned num_feature;
/* \brief number of class, if it is multi-class classification */ /* \brief number of class, if it is multi-class classification */
int num_class; int num_class;
/*! \brief whether the model itself is saved with pbuffer */
int saved_with_pbuffer;
/*! \brief reserved field */ /*! \brief reserved field */
int reserved[31]; int reserved[30];
/*! \brief constructor */ /*! \brief constructor */
ModelParam(void) { ModelParam(void) {
std::memset(this, 0, sizeof(ModelParam));
base_score = 0.5f; base_score = 0.5f;
num_feature = 0; num_feature = 0;
num_class = 0; num_class = 0;
std::memset(reserved, 0, sizeof(reserved)); saved_with_pbuffer = 0;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -476,6 +490,8 @@ class BoostLearner : public rabit::Serializable {
int silent; int silent;
// distributed learning mode, if any, 0:none, 1:col, 2:row // distributed learning mode, if any, 0:none, 1:col, 2:row
int distributed_mode; int distributed_mode;
// updater mode, 0:normal, reserved for internal test
int updater_mode;
// cached size of predict buffer // cached size of predict buffer
size_t pred_buffer_size; size_t pred_buffer_size;
// maximum buffered row value

View File

@ -48,6 +48,8 @@ struct TrainParam{
int size_leaf_vector; int size_leaf_vector;
// option for parallelization // option for parallelization
int parallel_option; int parallel_option;
// option to enable cacheline optimization
int cache_opt;
// number of threads to be used for tree construction, // number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default // if OpenMP is enabled, if equals 0, use system default
int nthread; int nthread;
@ -70,6 +72,7 @@ struct TrainParam{
parallel_option = 2; parallel_option = 2;
sketch_eps = 0.1f; sketch_eps = 0.1f;
sketch_ratio = 2.0f; sketch_ratio = 2.0f;
cache_opt = 1;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -96,6 +99,7 @@ struct TrainParam{
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val)); if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val); if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val); if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
@ -151,12 +155,12 @@ struct TrainParam{
return dw; return dw;
} }
/*! \brief whether need forward small to big search: default right */ /*! \brief whether need forward small to big search: default right */
inline bool need_forward_search(float col_density = 0.0f) const { inline bool need_forward_search(float col_density, bool indicator) const {
return this->default_direction == 2 || return this->default_direction == 2 ||
(default_direction == 0 && (col_density < opt_dense_col)); (default_direction == 0 && (col_density < opt_dense_col) && !indicator);
} }
/*! \brief whether need backward big to small search: default left */ /*! \brief whether need backward big to small search: default left */
inline bool need_backward_search(float col_density = 0.0f) const { inline bool need_backward_search(float col_density, bool indicator) const {
return this->default_direction != 2; return this->default_direction != 2;
} }
/*! \brief given the loss change, whether we need to invoke pruning */ /*! \brief given the loss change, whether we need to invoke pruning */
@ -192,6 +196,11 @@ struct GradStats {
double sum_grad; double sum_grad;
/*! \brief sum hessian statistics */ /*! \brief sum hessian statistics */
double sum_hess; double sum_hess;
/*!
* \brief whether this is simply statistics and we only need to call
* Add(gpair), instead of Add(gpair, info, ridx)
*/
static const int kSimpleStats = 1;
/*! \brief constructor, the object must be cleared during construction */ /*! \brief constructor, the object must be cleared during construction */
explicit GradStats(const TrainParam &param) { explicit GradStats(const TrainParam &param) {
this->Clear(); this->Clear();
@ -204,7 +213,14 @@ struct GradStats {
inline static void CheckInfo(const BoosterInfo &info) { inline static void CheckInfo(const BoosterInfo &info) {
} }
/*! /*!
* \brief accumulate statistics, * \brief accumulate statistics
* \param p the gradient pair
*/
inline void Add(bst_gpair p) {
this->Add(p.grad, p.hess);
}
/*!
* \brief accumulate statistics, more complicated version
* \param gpair the vector storing the gradient statistics * \param gpair the vector storing the gradient statistics
* \param info the additional information * \param info the additional information
* \param ridx instance index of this instance * \param ridx instance index of this instance

View File

@ -234,8 +234,9 @@ class ColMaker: public IUpdater {
const IFMatrix &fmat, const IFMatrix &fmat,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const BoosterInfo &info) { const BoosterInfo &info) {
bool need_forward = param.need_forward_search(fmat.GetColDensity(fid)); const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
bool need_backward = param.need_backward_search(fmat.GetColDensity(fid)); bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
const std::vector<int> &qexpand = qexpand_; const std::vector<int> &qexpand = qexpand_;
#pragma omp parallel #pragma omp parallel
{ {
@ -357,6 +358,99 @@ class ColMaker: public IUpdater {
} }
} }
} }
// update enumeration solution
inline void UpdateEnumeration(int nid, bst_gpair gstats,
float fvalue, int d_step, bst_uint fid,
TStats &c, std::vector<ThreadEntry> &temp) {
// get the statistics of nid
ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init
if (e.stats.Empty()) {
e.stats.Add(gstats);
e.last_fvalue = fvalue;
} else {
// try to find a split
if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
}
}
// update the statistics
e.stats.Add(gstats);
e.last_fvalue = fvalue;
}
}
// same as EnumerateSplit, with cacheline prefetch optimization
inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
int d_step,
bst_uint fid,
const std::vector<bst_gpair> &gpair,
std::vector<ThreadEntry> &temp) {
const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
// left statistics
TStats c(param);
// local cache buffer for position and gradient pair
const int kBuffer = 32;
int buf_position[kBuffer];
bst_gpair buf_gpair[kBuffer];
// aligned ending position
const ColBatch::Entry *align_end;
if (d_step > 0) {
align_end = begin + (end - begin) / kBuffer * kBuffer;
} else {
align_end = begin - (begin - end) / kBuffer * kBuffer;
}
int i;
const ColBatch::Entry *it;
const int align_step = d_step * kBuffer;
// internal cached loop
for (it = begin; it != align_end; it += align_step) {
const ColBatch::Entry *p;
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
buf_position[i] = position[p->index];
buf_gpair[i] = gpair[p->index];
}
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
const int nid = buf_position[i];
if (nid < 0) continue;
this->UpdateEnumeration(nid, buf_gpair[i],
p->fvalue, d_step,
fid, c, temp);
}
}
// finish up the ending piece
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
buf_position[i] = position[it->index];
buf_gpair[i] = gpair[it->index];
}
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
const int nid = buf_position[i];
if (nid < 0) continue;
this->UpdateEnumeration(nid, buf_gpair[i],
it->fvalue, d_step,
fid, c, temp);
}
// finish updating all statistics, check if it is possible to include all sum statistics
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
ThreadEntry &e = temp[nid];
c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float gap = std::abs(e.last_fvalue) + rt_eps;
const float delta = d_step == +1 ? gap: -gap;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
}
}
}
// enumerate the split values of specific feature // enumerate the split values of specific feature
inline void EnumerateSplit(const ColBatch::Entry *begin, inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end, const ColBatch::Entry *end,
@ -365,6 +459,11 @@ class ColMaker: public IUpdater {
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<ThreadEntry> &temp) { std::vector<ThreadEntry> &temp) {
// use cacheline aware optimization
if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
return;
}
const std::vector<int> &qexpand = qexpand_; const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics // clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) { for (size_t j = 0; j < qexpand.size(); ++j) {
@ -411,6 +510,7 @@ class ColMaker: public IUpdater {
} }
} }
} }
// update the solution candidate // update the solution candidate
virtual void UpdateSolution(const ColBatch &batch, virtual void UpdateSolution(const ColBatch &batch,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
@ -431,11 +531,12 @@ class ColMaker: public IUpdater {
const bst_uint fid = batch.col_index[i]; const bst_uint fid = batch.col_index[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
const ColBatch::Inst c = batch[i]; const ColBatch::Inst c = batch[i];
if (param.need_forward_search(fmat.GetColDensity(fid))) { const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue;
if (param.need_forward_search(fmat.GetColDensity(fid), ind)) {
this->EnumerateSplit(c.data, c.data + c.length, +1, this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]); fid, gpair, info, stemp[tid]);
} }
if (param.need_backward_search(fmat.GetColDensity(fid))) { if (param.need_backward_search(fmat.GetColDensity(fid), ind)) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]); fid, gpair, info, stemp[tid]);
} }
@ -550,8 +651,8 @@ class ColMaker: public IUpdater {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) { for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index; const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx); const int nid = this->DecodePosition(ridx);
const float fvalue = col[j].fvalue;
// go back to parent, correct those who are not default // go back to parent, correct those who are not default
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
if(fvalue < tree[nid].split_cond()) { if(fvalue < tree[nid].split_cond()) {

View File

@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker<TStats> {
utils::Assert(istart != hist.size, "the bound variable must be max"); utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gpair, info, ridx); hist.data[istart].Add(gpair, info, ridx);
} }
/*!
* \brief add a histogram to data,
* do linear scan, start from istart
*/
inline void Add(bst_float fv,
bst_gpair gstats) {
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gstats);
}
}; };
// sketch type used for this // sketch type used for this
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch; typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
@ -479,6 +489,32 @@ class CQHistMaker: public HistMaker<TStats> {
hbuilder[nid].istart = 0; hbuilder[nid].istart = 0;
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)]; hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
} }
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
const bst_uint kBuffer = 32;
bst_uint align_length = c.length / kBuffer * kBuffer;
int buf_position[kBuffer];
bst_gpair buf_gpair[kBuffer];
for (bst_uint j = 0; j < align_length; j += kBuffer) {
for (bst_uint i = 0; i < kBuffer; ++i) {
bst_uint ridx = c[j + i].index;
buf_position[i] = this->position[ridx];
buf_gpair[i] = gpair[ridx];
}
for (bst_uint i = 0; i < kBuffer; ++i) {
const int nid = buf_position[i];
if (nid >= 0) {
hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]);
}
}
}
for (bst_uint j = align_length; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
hbuilder[nid].Add(c[j].fvalue, gpair[ridx]);
}
}
} else {
for (bst_uint j = 0; j < c.length; ++j) { for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index; const bst_uint ridx = c[j].index;
const int nid = this->position[ridx]; const int nid = this->position[ridx];
@ -487,6 +523,7 @@ class CQHistMaker: public HistMaker<TStats> {
} }
} }
} }
}
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair, inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
const ColBatch::Inst &c, const ColBatch::Inst &c,
const RegTree &tree, const RegTree &tree,
@ -536,6 +573,32 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].Init(max_size); sbuilder[nid].Init(max_size);
} }
// second pass, build the sketch // second pass, build the sketch
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
const bst_uint kBuffer = 32;
bst_uint align_length = c.length / kBuffer * kBuffer;
int buf_position[kBuffer];
bst_float buf_hess[kBuffer];
for (bst_uint j = 0; j < align_length; j += kBuffer) {
for (bst_uint i = 0; i < kBuffer; ++i) {
bst_uint ridx = c[j + i].index;
buf_position[i] = this->position[ridx];
buf_hess[i] = gpair[ridx].hess;
}
for (bst_uint i = 0; i < kBuffer; ++i) {
const int nid = buf_position[i];
if (nid >= 0) {
sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
}
}
}
for (bst_uint j = align_length; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
}
}
} else {
for (bst_uint j = 0; j < c.length; ++j) { for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index; const bst_uint ridx = c[j].index;
const int nid = this->position[ridx]; const int nid = this->position[ridx];
@ -543,6 +606,7 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
} }
} }
}
for (size_t i = 0; i < this->qexpand.size(); ++i) { for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i]; const int nid = this->qexpand[i];
sbuilder[nid].Finalize(max_size); sbuilder[nid].Finalize(max_size);

View File

@ -328,12 +328,12 @@ struct WXQSummary : public WQSummary<DType, RType> {
} }
if (nbig >= n - 1) { if (nbig >= n - 1) {
// see what was the case // see what was the case
fprintf(stderr, "LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n);
fprintf(stderr, "LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n",
src.size, maxsize, static_cast<double>(range), src.size, maxsize, static_cast<double>(range),
static_cast<double>(chunk)); static_cast<double>(chunk));
for (size_t i = 0; i < src.size; ++i) { for (size_t i = 0; i < src.size; ++i) {
printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i,
src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, src.data[i].rmin, src.data[i].rmax, src.data[i].wmin,
src.data[i].value, CheckLarge(src.data[i], chunk)); src.data[i].value, CheckLarge(src.data[i], chunk));
} }

View File

@ -87,6 +87,7 @@ class BoostLearnTask {
if (!strcmp("name_pred", name)) name_pred = val; if (!strcmp("name_pred", name)) name_pred = val;
if (!strcmp("dsplit", name)) data_split = val; if (!strcmp("dsplit", name)) data_split = val;
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val);
if (!strncmp("eval[", name, 5)) { if (!strncmp("eval[", name, 5)) {
char evname[256]; char evname[256];
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
@ -115,6 +116,7 @@ class BoostLearnTask {
model_dir_path = "./"; model_dir_path = "./";
data_split = "NONE"; data_split = "NONE";
load_part = 0; load_part = 0;
save_with_pbuffer = 0;
data = NULL; data = NULL;
} }
~BoostLearnTask(void){ ~BoostLearnTask(void){
@ -241,7 +243,7 @@ class BoostLearnTask {
} }
inline void SaveModel(const char *fname) const { inline void SaveModel(const char *fname) const {
if (rabit::GetRank() != 0) return; if (rabit::GetRank() != 0) return;
learner.SaveModel(fname); learner.SaveModel(fname, save_with_pbuffer != 0);
} }
inline void SaveModel(int i) const { inline void SaveModel(int i) const {
char fname[256]; char fname[256];
@ -297,6 +299,8 @@ class BoostLearnTask {
int pred_margin; int pred_margin;
/*! \brief whether dump statistics along with model */ /*! \brief whether dump statistics along with model */
int dump_model_stats; int dump_model_stats;
/*! \brief whether save prediction buffer */
int save_with_pbuffer;
/*! \brief name of feature map */ /*! \brief name of feature map */
std::string name_fmap; std::string name_fmap;
/*! \brief name of dump file */ /*! \brief name of dump file */

View File

@ -7,6 +7,8 @@ Python
* To make the python module, type ```./build.sh``` in the root directory of project * To make the python module, type ```./build.sh``` in the root directory of project
* Install with `python setup.py install` from this directory. * Install with `python setup.py install` from this directory.
* Refer also to the walk through example in [demo folder](../demo/guide-python) * Refer also to the walk through example in [demo folder](../demo/guide-python)
* **NOTE**: if you want to run XGBoost in parallel processes using the fork backend of joblib/multiprocessing, you must build XGBoost without OpenMP support (`make no_omp=1`). Otherwise, use the forkserver (Python 3.4+) or spawn backend, as in the sketch below; see also the sklearn_parallel.py demo.
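
A minimal sketch of the forkserver approach (the data path and parameters are placeholders), training two boosters in separate processes so an OpenMP-enabled build stays safe:

```python
# minimal sketch, assuming a LibSVM training file at a placeholder path
import multiprocessing as mp
import xgboost as xgb

def fit_one(seed):
    # each worker process builds its own DMatrix and trains independently
    dtrain = xgb.DMatrix('train.libsvm')
    params = {'max_depth': 3, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': seed}
    bst = xgb.train(params, dtrain, num_boost_round=10)
    bst.save_model('model_%d.bin' % seed)

if __name__ == '__main__':
    mp.set_start_method('forkserver')   # or 'spawn'; avoids mixing fork with OpenMP
    procs = [mp.Process(target=fit_one, args=(s,)) for s in (0, 1)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```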
R R
===== =====

View File

@ -28,7 +28,7 @@ if len(lib_path) == 0:
raise XGBoostLibraryNotFound("XGBoost library not found. Did you run " raise XGBoostLibraryNotFound("XGBoost library not found. Did you run "
"../make?") "../make?")
setup(name="xgboost", setup(name="xgboost",
version="0.32", version="0.40",
description="Python wrappers for XGBoost: eXtreme Gradient Boosting", description="Python wrappers for XGBoost: eXtreme Gradient Boosting",
zip_safe=False, zip_safe=False,
py_modules=['xgboost'], py_modules=['xgboost'],

View File

@ -1,8 +1,8 @@
# coding: utf-8 # coding: utf-8
""" """
xgboost: eXtreme Gradient Boosting library xgboost: eXtreme Gradient Boosting library
Version: 0.40
Authors: Tianqi Chen, Bing Xu Authors: Tianqi Chen, Bing Xu
Early stopping by Zygmunt Zając Early stopping by Zygmunt Zając
""" """
@ -30,6 +30,9 @@ except ImportError:
class XGBoostLibraryNotFound(Exception): class XGBoostLibraryNotFound(Exception):
pass pass
class XGBoostError(Exception):
pass
__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train']
if sys.version_info[0] == 3: if sys.version_info[0] == 3:
@ -70,6 +73,8 @@ def load_xglib():
lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
lib.XGBoosterGetModelRaw.restype = ctypes.POINTER(ctypes.c_char)
lib.XGBoosterLoadModelFromBuffer.restype = ctypes.c_void_p
return lib return lib
@ -89,6 +94,16 @@ def ctypes2numpy(cptr, length, dtype):
return res return res
def ctypes2buffer(cptr, length):
if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
raise RuntimeError('expected char pointer')
res = bytearray(length)
rptr = (ctypes.c_char * length).from_buffer(res)
if not ctypes.memmove(rptr, cptr, length):
raise RuntimeError('memmove failed')
return res
def c_str(string): def c_str(string):
return ctypes.c_char_p(string.encode('utf-8')) return ctypes.c_char_p(string.encode('utf-8'))
@ -98,7 +113,7 @@ def c_array(ctype, values):
class DMatrix(object): class DMatrix(object):
def __init__(self, data, label=None, missing=0.0, weight=None): def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
""" """
Data matrix used in XGBoost. Data matrix used in XGBoost.
@ -113,14 +128,15 @@ class DMatrix(object):
Value in the data which needs to be present as a missing value. Value in the data which needs to be present as a missing value.
weight : list or numpy 1-D array (optional) weight : list or numpy 1-D array (optional)
Weight for each instance. Weight for each instance.
silent: boolean
Whether to print messages during construction
""" """
# force into void_p, mac need to pass things in as void_p # force into void_p, mac need to pass things in as void_p
if data is None: if data is None:
self.handle = None self.handle = None
return return
if isinstance(data, string_types): if isinstance(data, string_types):
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0)) self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent)))
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data) self._init_from_csr(data)
elif isinstance(data, scipy.sparse.csc_matrix): elif isinstance(data, scipy.sparse.csc_matrix):
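For illustration, a one-line sketch of the new flag when loading from a file (placeholder path):

```python
import xgboost as xgb

dtrain = xgb.DMatrix('train.libsvm', silent=True)  # suppresses the construction message
```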
@ -335,6 +351,46 @@ class Booster(object):
def __del__(self): def __del__(self):
xglib.XGBoosterFree(self.handle) xglib.XGBoosterFree(self.handle)
def __getstate__(self):
# can't pickle ctypes pointers
# put model content in bytearray
this = self.__dict__.copy()
handle = this['handle']
if handle is not None:
raw = self.save_raw()
this["handle"] = raw
return this
def __setstate__(self, state):
# reconstruct handle from raw data
handle = state['handle']
if handle is not None:
buf = handle
dmats = c_array(ctypes.c_void_p, [])
handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0))
length = ctypes.c_ulong(len(buf))
ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length)
state['handle'] = handle
self.__dict__.update(state)
self.set_param({'seed': 0})
def __copy__(self):
return self.__deepcopy__(None)
def __deepcopy__(self, _memo):
return Booster(model_file=self.save_raw())
def copy(self):
"""
Copy the booster object
Returns
--------
a copied booster model
"""
return self.__copy__()
def set_param(self, params, pv=None): def set_param(self, params, pv=None):
if isinstance(params, collections.Mapping): if isinstance(params, collections.Mapping):
params = params.items() params = params.items()
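With the __getstate__/__setstate__ pair above, a trained Booster survives a pickle round trip; a minimal sketch, assuming a LibSVM file at a placeholder path:

```python
import pickle
import xgboost as xgb

dtrain = xgb.DMatrix('train.libsvm')
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=5)

blob = pickle.dumps(bst)      # the ctypes handle is replaced by the raw model bytes
bst2 = pickle.loads(blob)     # rebuilt through XGBoosterLoadModelFromBuffer
preds = bst2.predict(dtrain)  # same trees, hence same predictions as bst.predict(dtrain)
```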
@ -427,6 +483,11 @@ class Booster(object):
""" """
Predict with data. Predict with data.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction from multiple threads, call bst.copy() to make copies
of the model object and then call predict on each copy
Parameters Parameters
---------- ----------
data : DMatrix data : DMatrix
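A minimal sketch of that advice, assuming a trained `bst` and two DMatrices `d0` and `d1` already exist:

```python
import threading

copies = [bst.copy() for _ in range(2)]   # one independent Booster per thread
results = [None, None]

def worker(i, dmat):
    results[i] = copies[i].predict(dmat)  # each thread predicts with its own copy

threads = [threading.Thread(target=worker, args=(i, d)) for i, d in enumerate((d0, d1))]
for t in threads:
    t.start()
for t in threads:
    t.join()
```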
@ -468,9 +529,25 @@ class Booster(object):
Parameters Parameters
---------- ----------
fname : string fname : string
Output file name. Output file name
""" """
if isinstance(fname, string_types): # assume file name
xglib.XGBoosterSaveModel(self.handle, c_str(fname)) xglib.XGBoosterSaveModel(self.handle, c_str(fname))
else:
raise TypeError("fname must be a string")
def save_raw(self):
"""
Save the model to an in-memory buffer representation
Returns
-------
an in-memory buffer representation of the model
"""
length = ctypes.c_ulong()
cptr = xglib.XGBoosterGetModelRaw(self.handle,
ctypes.byref(length))
return ctypes2buffer(cptr, length.value)
def load_model(self, fname): def load_model(self, fname):
""" """
@ -478,10 +555,16 @@ class Booster(object):
Parameters Parameters
---------- ----------
fname : string fname : string or a memory buffer
Input file name. Input file name or memory buffer (see also save_raw)
""" """
if isinstance(fname, str): # assume file name
xglib.XGBoosterLoadModel(self.handle, c_str(fname)) xglib.XGBoosterLoadModel(self.handle, c_str(fname))
else:
buf = fname
length = ctypes.c_ulong(len(buf))
ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)
def dump_model(self, fo, fmap='', with_stats=False): def dump_model(self, fo, fmap='', with_stats=False):
""" """
@ -622,7 +705,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
maximize_score = False maximize_score = False
if 'eval_metric' in params: if 'eval_metric' in params:
maximize_metrics = ('auc', 'map', 'ndcg') maximize_metrics = ('auc', 'map', 'ndcg')
if filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics): if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)):
maximize_score = True maximize_score = True
if maximize_score: if maximize_score:
@ -659,11 +742,11 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
bst.best_score = best_score bst.best_score = best_score
bst.best_iteration = best_score_i bst.best_iteration = best_score_i
break
bst.best_score = best_score
bst.best_iteration = best_score_i
return bst return bst
return bst
class CVPack(object): class CVPack(object):
def __init__(self, dtrain, dtest, param): def __init__(self, dtrain, dtest, param):
self.dtrain = dtrain self.dtrain = dtrain
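A sketch of how these attributes are consumed, assuming `params`, `dtrain` and `dvalid` are already defined and that the keyword argument is named `early_stopping_rounds` in this version:

```python
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist,
                early_stopping_rounds=10)
# after stopping, the booster records the best round seen on the last evaluation set
print(bst.best_score, bst.best_iteration)
```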
@ -815,12 +898,15 @@ class XGBModel(XGBModelBase):
The initial prediction score of all instances, global bias. The initial prediction score of all instances, global bias.
seed : int seed : int
Random number seed. Random number seed.
missing : float, optional
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
""" """
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear", def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear",
nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
base_score=0.5, seed=0): base_score=0.5, seed=0, missing=None):
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
raise Exception('sklearn needs to be installed in order to use this module') raise XGBoostError('sklearn needs to be installed in order to use this module')
self.max_depth = max_depth self.max_depth = max_depth
self.learning_rate = learning_rate self.learning_rate = learning_rate
self.n_estimators = n_estimators self.n_estimators = n_estimators
@ -836,8 +922,37 @@ class XGBModel(XGBModelBase):
self.base_score = base_score self.base_score = base_score
self.seed = seed self.seed = seed
self.missing = missing if missing is not None else np.nan
self._Booster = Booster() self._Booster = None
def __setstate__(self, state):
# backward compatibility code
# load booster from raw data if it is raw
# the booster now supports pickle
bst = state["_Booster"]
if bst is not None and not isinstance(bst, Booster):
state["_Booster"] = Booster(model_file=bst)
self.__dict__.update(state)
def booster(self):
"""
Get the underlying xgboost Booster of this model.
Will raise an exception when fit was not called.
Returns
-------
booster : a xgboost booster of underlying model
"""
if self._Booster is None:
raise XGBoostError('need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
params = super(XGBModel, self).get_params(deep=deep)
if params['missing'] is np.nan:
params['missing'] = None # sklearn doesn't handle nan. see #4725
return params
def get_xgb_params(self): def get_xgb_params(self):
xgb_params = self.get_params() xgb_params = self.get_params()
@ -849,13 +964,13 @@ class XGBModel(XGBModelBase):
return xgb_params return xgb_params
def fit(self, X, y): def fit(self, X, y):
trainDmatrix = DMatrix(X, label=y) trainDmatrix = DMatrix(X, label=y, missing=self.missing)
self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators) self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators)
return self return self
def predict(self, X): def predict(self, X):
testDmatrix = DMatrix(X) testDmatrix = DMatrix(X, missing=self.missing)
return self._Booster.predict(testDmatrix) return self.booster().predict(testDmatrix)
class XGBClassifier(XGBModel, XGBClassifier): class XGBClassifier(XGBModel, XGBClassifier):
@@ -865,15 +980,15 @@ class XGBClassifier(XGBModel, XGBClassifier):
     def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic",
                  nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0):
+                 base_score=0.5, seed=0, missing=None):
         super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective,
                                             nthread, gamma, min_child_weight, max_delta_step, subsample,
                                             colsample_bytree,
-                                            base_score, seed)
+                                            base_score, seed, missing)

     def fit(self, X, y, sample_weight=None):
-        y_values = list(np.unique(y))
-        self.n_classes_ = len(y_values)
+        self.classes_ = list(np.unique(y))
+        self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
             # Switch to using a multiclass objective in the underlying XGB instance
             self.objective = "multi:softprob"
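Storing `classes_` makes the wrapper behave like other sklearn classifiers, and a label set with more than two classes silently switches the booster to a multiclass objective. A quick sketch of the intended behaviour (random toy data; the printed values are illustrative):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(30, 4)
y = np.array([0, 1, 2] * 10)           # three classes

clf = xgb.XGBClassifier(n_estimators=5).fit(X, y)
print(clf.classes_)                     # [0, 1, 2]
print(clf.n_classes_)                   # 3
print(clf.objective)                    # switched to "multi:softprob"
print(clf.predict_proba(X).shape)       # (30, 3)
```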
@@ -886,17 +1001,19 @@ class XGBClassifier(XGBModel, XGBClassifier):
         training_labels = self._le.transform(y)

         if sample_weight is not None:
-            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight)
+            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
+                                   missing=self.missing)
         else:
-            trainDmatrix = DMatrix(X, label=training_labels)
+            trainDmatrix = DMatrix(X, label=training_labels,
+                                   missing=self.missing)

         self._Booster = train(xgb_options, trainDmatrix, self.n_estimators)

         return self

     def predict(self, X):
-        testDmatrix = DMatrix(X)
-        class_probs = self._Booster.predict(testDmatrix)
+        testDmatrix = DMatrix(X, missing=self.missing)
+        class_probs = self.booster().predict(testDmatrix)
         if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)
         else:
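`sample_weight` is simply forwarded to the `DMatrix`, so per-row weights behave the same as in the core API. For illustration (the weights below are arbitrary):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(40, 3)
y = np.random.randint(0, 2, size=40)

# Up-weight the second half of the training rows.
weights = np.where(np.arange(40) < 20, 1.0, 5.0)

clf = xgb.XGBClassifier(n_estimators=10)
clf.fit(X, y, sample_weight=weights)
print(clf.predict(X[:5]))
```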
@@ -905,8 +1022,8 @@ class XGBClassifier(XGBModel, XGBClassifier):
         return self._le.inverse_transform(column_indexes)

     def predict_proba(self, X):
-        testDmatrix = DMatrix(X)
-        class_probs = self._Booster.predict(testDmatrix)
+        testDmatrix = DMatrix(X, missing=self.missing)
+        class_probs = self.booster().predict(testDmatrix)
         if self.objective == "multi:softprob":
             return class_probs
         else:
@@ -914,7 +1031,6 @@ class XGBClassifier(XGBModel, XGBClassifier):
             classzero_probs = 1.0 - classone_probs
             return np.vstack((classzero_probs, classone_probs)).transpose()

 class XGBRegressor(XGBModel, XGBRegressor):
     __doc__ = """
     Implementation of the scikit-learn API for XGBoost regression
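For the binary objective the booster returns only the positive-class probability, so `predict_proba` stacks `1 - p` and `p` into the `(n_samples, 2)` layout sklearn expects. The branch above is equivalent to roughly this (standalone helper written for illustration):

```python
import numpy as np

def stack_binary_probs(classone_probs):
    """Mimic the binary branch of XGBClassifier.predict_proba above."""
    classone_probs = np.asarray(classone_probs)
    classzero_probs = 1.0 - classone_probs
    return np.vstack((classzero_probs, classone_probs)).transpose()

print(stack_binary_probs([0.1, 0.8, 0.5]))
# [[0.9 0.1]
#  [0.2 0.8]
#  [0.5 0.5]]
```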
View File
@@ -58,13 +58,14 @@ class Booster: public learner::BoostLearner {
   }
   inline void LoadModelFromBuffer(const void *buf, size_t size) {
     utils::MemoryFixSizeBuffer fs((void*)buf, size);
-    learner::BoostLearner::LoadModel(fs);
+    learner::BoostLearner::LoadModel(fs, true);
     this->init_model = true;
   }
   inline const char *GetModelRaw(bst_ulong *out_len) {
+    this->CheckInitModel();
     model_str.resize(0);
     utils::MemoryBufferStream fs(&model_str);
-    learner::BoostLearner::SaveModel(fs);
+    learner::BoostLearner::SaveModel(fs, false);
     *out_len = static_cast<bst_ulong>(model_str.length());
     if (*out_len == 0) {
       return NULL;
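`GetModelRaw` and `LoadModelFromBuffer` are what back the in-memory save/load that pickling relies on from Python. A hedged sketch, assuming `Booster.save_raw()` and the `Booster(model_file=<raw buffer>)` path that `__setstate__` above uses:

```python
import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(10, 3), label=np.random.rand(10))
bst = xgb.train({'objective': 'reg:linear'}, dtrain, num_boost_round=3)

raw = bst.save_raw()                 # serialized via GetModelRaw
bst2 = xgb.Booster(model_file=raw)   # restored via LoadModelFromBuffer
print(bst2.predict(dtrain)[:3])
```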
@@ -322,8 +323,10 @@ extern "C"{
   void XGBoosterLoadModel(void *handle, const char *fname) {
     static_cast<Booster*>(handle)->LoadModel(fname);
   }
-  void XGBoosterSaveModel(const void *handle, const char *fname) {
-    static_cast<const Booster*>(handle)->SaveModel(fname);
+  void XGBoosterSaveModel(void *handle, const char *fname) {
+    Booster *bst = static_cast<Booster*>(handle);
+    bst->CheckInitModel();
+    bst->SaveModel(fname, false);
   }
   void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) {
     static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
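`XGBoosterSaveModel` now runs `CheckInitModel` before serializing, presumably because that check may have to initialize internal state, which is also why the handle is no longer `const` (see the header change below). From Python the same entry points are reached through `save_model` and the `model_file` constructor argument; a file-based round trip might look like this (file name chosen for illustration):

```python
import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(10, 3), label=np.random.rand(10))
bst = xgb.train({'objective': 'reg:linear'}, dtrain, num_boost_round=3)

# Booster.save_model / Booster(model_file=...) call into
# XGBoosterSaveModel / XGBoosterLoadModel shown above.
bst.save_model('xgb.model')
bst2 = xgb.Booster(model_file='xgb.model')
print(bst2.predict(dtrain)[:3])
```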
View File
@@ -203,7 +203,7 @@ extern "C" {
    * \param handle handle
    * \param fname file name
    */
-  XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
+  XGB_DLL void XGBoosterSaveModel(void *handle, const char *fname);
   /*!
    * \brief load model from in memory buffer
    * \param handle handle