From 54fb49ee5ce5208f89217d4b689090322788263a Mon Sep 17 00:00:00 2001 From: hetong007 Date: Tue, 5 May 2015 16:31:49 -0700 Subject: [PATCH 01/60] add early stopping to R --- R-package/NAMESPACE | 2 +- R-package/R/xgb.train.R | 58 ++++++++++++++++++- R-package/R/xgboost.R | 10 +++- R-package/man/agaricus.test.Rd | 2 +- R-package/man/agaricus.train.Rd | 2 +- R-package/man/getinfo.Rd | 2 +- R-package/man/nrow-xgb.DMatrix-method.Rd | 2 +- R-package/man/predict-xgb.Booster-method.Rd | 2 +- .../man/predict-xgb.Booster.handle-method.Rd | 2 +- R-package/man/setinfo.Rd | 2 +- R-package/man/slice.Rd | 2 +- R-package/man/xgb.DMatrix.Rd | 2 +- R-package/man/xgb.DMatrix.save.Rd | 2 +- R-package/man/xgb.cv.Rd | 2 +- R-package/man/xgb.dump.Rd | 2 +- R-package/man/xgb.importance.Rd | 2 +- R-package/man/xgb.load.Rd | 2 +- R-package/man/xgb.model.dt.tree.Rd | 2 +- R-package/man/xgb.plot.importance.Rd | 2 +- R-package/man/xgb.plot.tree.Rd | 2 +- R-package/man/xgb.save.Rd | 2 +- R-package/man/xgb.save.raw.Rd | 2 +- R-package/man/xgb.train.Rd | 18 ++++-- R-package/man/xgboost.Rd | 11 +++- 24 files changed, 106 insertions(+), 31 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index a4f07799a..d7f9e455c 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2 (4.1.0): do not edit by hand export(getinfo) export(setinfo) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index a99740f64..6f6c1a900 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -66,7 +66,11 @@ #' prediction and dtrain, #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both -#' +#' @param earlyStopRound If \code{NULL}, the early stopping function is not triggered. +#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param maximize If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' #' @details @@ -114,7 +118,8 @@ #' @export #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), - obj = NULL, feval = NULL, verbose = 1, ...) { + obj = NULL, feval = NULL, verbose = 1, + earlyStopRound = NULL, maximize = NULL, ...) 
{ dtrain <- data if (typeof(params) != "list") { stop("xgb.train: first argument params must be list") @@ -133,6 +138,33 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } params = append(params, list(...)) + # Early stopping + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (length(watchlist) == 0 && !is.null(earlyStopRound)) + stop('For early stopping you need at least one set in watchlist.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } + } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(watchlist)>1 && !is.null(earlyStopRound)) + warning('Only the first data set in watchlist is used for early stopping process.') + handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) for (i in 1:nrounds) { @@ -140,8 +172,30 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) cat(paste(msg, "\n", sep="")) + if (!is.null(earlyStopRound)) + { + score = strsplit(msg,'\\s+')[[1]][1] + score = strsplit(score,':')[[1]][2] + score = as.numeric(score) + if ((maximize && score>bestScore) || (!maximize && score<bestScore)) { + bestScore = score + bestInd = i + } else { + if (i-bestInd>earlyStopRound) { + earlyStopflag = TRUE + } + } + } + } + if (earlyStopflag) { + cat('Stopping. Best iteration:',bestInd) + break } } bst <- xgb.Booster.check(bst) + if (!is.null(earlyStopRound)) { + bst$bestScore = bestScore + bst$bestInd = bestInd + } return(bst) } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index ede53b116..367a149e7 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -30,6 +30,11 @@ #' performance and construction progress information #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values. +#' @param earlyStopRound If \code{NULL}, the early stopping function is not triggered. +#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param maximize If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' #' @details @@ -51,7 +56,7 @@ #' @export #' xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, - verbose = 1, ...) { + verbose = 1, earlyStopRound = NULL, maximize = NULL, ...)
{ if (is.null(missing)) { dtrain <- xgb.get.DMatrix(data, label) } else { @@ -66,7 +71,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), watchlist <- list() } - bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose) + bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, + earlyStopRound = earlyStopRound) return(bst) } diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index c54e30ba3..556425379 100644 --- a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.test} diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index 955257148..879b3d5df 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.train} diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 87c507566..618d0d44b 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/getinfo.xgb.DMatrix.R \docType{methods} \name{getinfo} diff --git a/R-package/man/nrow-xgb.DMatrix-method.Rd b/R-package/man/nrow-xgb.DMatrix-method.Rd index f86709afd..953e620bf 100644 --- a/R-package/man/nrow-xgb.DMatrix-method.Rd +++ b/R-package/man/nrow-xgb.DMatrix-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/nrow.xgb.DMatrix.R \docType{methods} \name{nrow,xgb.DMatrix-method} diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index 3ce2e2025..06fdb2ca8 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/predict.xgb.Booster.R \docType{methods} \name{predict,xgb.Booster-method} diff --git a/R-package/man/predict-xgb.Booster.handle-method.Rd b/R-package/man/predict-xgb.Booster.handle-method.Rd index 7eb237a94..cc9ba29f9 100644 --- a/R-package/man/predict-xgb.Booster.handle-method.Rd +++ b/R-package/man/predict-xgb.Booster.handle-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/predict.xgb.Booster.handle.R \docType{methods} \name{predict,xgb.Booster.handle-method} diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index edf5284bd..9512f1dfb 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/setinfo.xgb.DMatrix.R \docType{methods} \name{setinfo} diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 20a78a383..a7812e886 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not 
edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/slice.xgb.DMatrix.R \docType{methods} \name{slice} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 9d4d19d37..ea644a291 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.DMatrix.R \name{xgb.DMatrix} \alias{xgb.DMatrix} diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index 3ba36f55a..6bbc277b3 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.DMatrix.save.R \name{xgb.DMatrix.save} \alias{xgb.DMatrix.save} diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 19ab788f9..feee4e18f 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.cv.R \name{xgb.cv} \alias{xgb.cv} diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index eaf1ca521..124535211 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.dump.R \name{xgb.dump} \alias{xgb.dump} diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 11740e4ac..674a54622 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.importance.R \name{xgb.importance} \alias{xgb.importance} diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1331ff249..4caef6239 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.load.R \name{xgb.load} \alias{xgb.load} diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index c53ed057f..df308a954 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.model.dt.tree.R \name{xgb.model.dt.tree} \alias{xgb.model.dt.tree} diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 4147278b9..0797b89c2 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.plot.importance.R \name{xgb.plot.importance} \alias{xgb.plot.importance} diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 4501d87ce..476dbda11 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 
(4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.plot.tree.R \name{xgb.plot.tree} \alias{xgb.plot.tree} diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index eca097fac..6e6b23e54 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.save.R \name{xgb.save} \alias{xgb.save} diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 79c356c0f..94ae29416 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.save.raw.R \name{xgb.save.raw} \alias{xgb.save.raw} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 1bd243d60..6a1aa874b 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -1,11 +1,12 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgb.train.R \name{xgb.train} \alias{xgb.train} \title{eXtreme Gradient Boosting Training} \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, - feval = NULL, verbose = 1, ...) + feval = NULL, verbose = 1, earlyStopRound = NULL, maximize = NULL, + ...) } \arguments{ \item{params}{the list of parameters. @@ -49,7 +50,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{binary:logistic} logistic regression for binary classification. Output probability. \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. \item \code{num_class} set the number of classes. To use only with multiclass objectives. - \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is a number and should be from 0 \code{tonum_class} + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}. \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. } @@ -75,7 +76,14 @@ gradient with given prediction and dtrain,} prediction and dtrain,} \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print - information of performance. If 2, xgboost will print information of both} +information of performance. If 2, xgboost will print information of both} + +\item{earlyStopRound}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{maximize}{If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. +\code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} } @@ -98,7 +106,7 @@ Number of threads can also be manually specified via \code{nthread} parameter.
\item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. - \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} + \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG} } Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}. diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 79cff207a..bc2311a2b 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -1,11 +1,11 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand % Please edit documentation in R/xgboost.R \name{xgboost} \alias{xgboost} \title{eXtreme Gradient Boosting (Tree) library} \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, ...) + nrounds, verbose = 1, earlyStopRound = NULL, maximize = NULL, ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or @@ -41,6 +41,13 @@ Commonly used ones are: information of performance. If 2, xgboost will print information of both performance and construction progress information} +\item{earlyStopRound}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{maximize}{If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. 
+\code{maximize=TRUE} means the larger the evaluation score the better.} + \item{...}{other parameters to pass to \code{params}.} } \description{ From 0f182b0b66d95306e1ab3b16c6e1aadcfdc5256b Mon Sep 17 00:00:00 2001 From: hetong007 Date: Tue, 5 May 2015 16:44:36 -0700 Subject: [PATCH 02/60] fix logic --- R-package/R/xgb.train.R | 4 +-- R-package/demo/00Index | 1 + R-package/demo/early_Stopping.R | 58 +++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 R-package/demo/early_Stopping.R diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 6f6c1a900..636ad3cad 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -139,11 +139,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), params = append(params, list(...)) + # Early stopping - if (!is.null(feval) && is.null(maximize)) + if (!is.null(feval) && is.null(maximize) && !is.null(earlyStopRound)) stop('Please set maximize to note whether the model is maximizing the evaluation or not.') if (length(watchlist) == 0 && !is.null(earlyStopRound)) stop('For early stopping you need at least one set in watchlist.') - if (is.null(maximize) && is.null(params$eval_metric)) + if (is.null(maximize) && is.null(params$eval_metric) && !is.null(earlyStopRound)) stop('Please set maximize to note whether the model is maximizing the evaluation or not.') if (is.null(maximize)) { diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 969da0d91..f0b41ec2a 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -6,3 +6,4 @@ generalized_linear_model Generalized Linear Model cross_validation Cross validation create_sparse_matrix Create Sparse Matrix predict_leaf_indices Predicting the corresponding leaves +early_Stopping Early Stop in training diff --git a/R-package/demo/early_Stopping.R b/R-package/demo/early_Stopping.R new file mode 100644 index 000000000..3253c3828 --- /dev/null +++ b/R-package/demo/early_Stopping.R @@ -0,0 +1,58 @@ +require(xgboost) +# load in the agaricus dataset +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) +# note: for customized objective function, we leave objective as default +# note: what we are getting is margin value in prediction +# you must know what you are doing +param <- list(max.depth=2,eta=1,nthread = 2, silent=1) +watchlist <- list(eval = dtest) +num_round <- 20 +# user defined objective function: given prediction, return gradient and second order gradient +# this is log-likelihood loss +logregobj <- function(preds, dtrain) { + labels <- getinfo(dtrain, "label") + preds <- 1/(1 + exp(-preds)) + grad <- preds - labels + hess <- preds * (1 - preds) + return(list(grad = grad, hess = hess)) +} +# user defined evaluation function, return a pair metric_name, result +# NOTE: when you do customized loss function, the default prediction value is margin +# this may make the built-in evaluation metric not function properly +# for example, we are doing logistic loss, the prediction is score before logistic transformation +# the built-in evaluation error assumes input is after logistic transformation +# Keep this in mind when you use the customization, as you may need to write a customized evaluation function +evalerror <- function(preds, dtrain) { + labels <- getinfo(dtrain, "label") + err <- as.numeric(sum(labels != (preds > 0)))/length(labels) +
return(list(metric = "error", value = err)) +} +print ('start training with user customized objective') +# training with customized objective, we can also do step by step training +# simply look at xgboost.py's implementation of train +bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE, + earlyStopRound = 3) +# +# there can be cases where you want additional information +# being considered besides the property of DMatrix you can get by getinfo +# you can set additional information as attributes if DMatrix +# set label attribute of dtrain to be label, we use label as an example, it can be anything +attr(dtrain, 'label') <- getinfo(dtrain, 'label') +# this is new customized objective, where you can access things you set +# same thing applies to customized evaluation function +logregobjattr <- function(preds, dtrain) { + # now you can access the attribute in customized function + labels <- attr(dtrain, 'label') + preds <- 1/(1 + exp(-preds)) + grad <- preds - labels + hess <- preds * (1 - preds) + return(list(grad = grad, hess = hess)) +} +print ('start training with user customized objective, with additional attributes in DMatrix') +# training with customized objective, we can also do step by step training +# simply look at xgboost.py's implementation of train +bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror, maximize = FALSE, + earlyStopRound = 3) \ No newline at end of file From 11fa4197208bf3c3054c93a9609602eb7b8b9c4d Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Wed, 6 May 2015 12:33:43 -0500 Subject: [PATCH 03/60] ENH: Make XGBModel pickleable. --- wrapper/xgboost.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 8ef82b2c7..4a5248818 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -15,6 +15,7 @@ import re import ctypes import platform import collections +from tempfile import NamedTemporaryFile import numpy as np import scipy.sparse @@ -839,6 +840,31 @@ class XGBModel(XGBModelBase): self._Booster = Booster() + def __getstate__(self): + # can't pickle ctypes pointers so save _Booster directly + this = self.__dict__.copy() # don't modify in place + + # delete = False for x-platform compatibility + # https://bugs.python.org/issue14243 + with NamedTemporaryFile(mode="wb", delete=False) as tmp: + this["_Booster"].save_model(tmp.name) + tmp.close() + booster = open(tmp.name, "rb").read() + os.remove(tmp.name) + this.update({"_Booster": booster}) + + return this + + def __setstate__(self, state): + with NamedTemporaryFile(mode="wb", delete=False) as tmp: + tmp.write(state["_Booster"]) + tmp.close() + booster = Booster(model_file=tmp.name) + os.remove(tmp.name) + + state["_Booster"] = booster + self.__dict__.update(state) + def get_xgb_params(self): xgb_params = self.get_params() From 13837060f1a17aa691c1ac57969ce21f7969e0f2 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Wed, 6 May 2015 14:59:14 -0500 Subject: [PATCH 04/60] ENH: Don't use tempfiles for save/load --- wrapper/xgboost.py | 55 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 4a5248818..48fa02b76 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -15,7 +15,7 @@ import re import ctypes import platform import collections -from tempfile import NamedTemporaryFile +from io import BytesIO import numpy as np import scipy.sparse @@ -71,6 +71,8 @@ def 
load_xglib(): lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) + lib.XGBoosterGetModelRaw.restype = ctypes.POINTER(ctypes.c_char) + lib.XGBoosterLoadModelFromBuffer.restype = ctypes.c_void_p return lib @@ -468,10 +470,19 @@ class Booster(object): Parameters ---------- - fname : string - Output file name. + fname : string or file handle + Output file name or handle. If a handle is given must be a BytesIO + object or a file opened for writing in binary format. """ - xglib.XGBoosterSaveModel(self.handle, c_str(fname)) + if isinstance(fname, string_types): # assume file name + xglib.XGBoosterSaveModel(self.handle, c_str(fname)) + else: + length = ctypes.c_ulong() + cptr = xglib.XGBoosterGetModelRaw(self.handle, + ctypes.byref(length)) + address = ctypes.addressof(cptr.contents) + buf = (ctypes.c_char * length.value).from_address(address) + fname.write(buf) def load_model(self, fname): """ @@ -479,10 +490,16 @@ class Booster(object): Parameters ---------- - fname : string - Input file name. + fname : string of file handle + Input file name or file handle object. """ - xglib.XGBoosterLoadModel(self.handle, c_str(fname)) + if isinstance(fname, string_types): # assume file name + xglib.XGBoosterLoadModel(self.handle, c_str(fname)) + else: + buf = fname.getbuffer() + length = ctypes.c_ulong(buf.nbytes) + ptr = ctypes.byref(ctypes.c_void_p.from_buffer(buf)) + xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length) def dump_model(self, fo, fmap='', with_stats=False): """ @@ -841,28 +858,20 @@ class XGBModel(XGBModelBase): self._Booster = Booster() def __getstate__(self): - # can't pickle ctypes pointers so save _Booster directly + # can't pickle ctypes pointers so put _Booster in a BytesIO obj + this = self.__dict__.copy() # don't modify in place - # delete = False for x-platform compatibility - # https://bugs.python.org/issue14243 - with NamedTemporaryFile(mode="wb", delete=False) as tmp: - this["_Booster"].save_model(tmp.name) - tmp.close() - booster = open(tmp.name, "rb").read() - os.remove(tmp.name) - this.update({"_Booster": booster}) + tmp = BytesIO() + this["_Booster"].save_model(tmp) + tmp.seek(0) + this["_Booster"] = tmp return this def __setstate__(self, state): - with NamedTemporaryFile(mode="wb", delete=False) as tmp: - tmp.write(state["_Booster"]) - tmp.close() - booster = Booster(model_file=tmp.name) - os.remove(tmp.name) - - state["_Booster"] = booster + booster = state["_Booster"] + state["_Booster"] = Booster(model_file=booster) self.__dict__.update(state) def get_xgb_params(self): From 419e4dbda6ac69d7c905663dcecc3d18800ed31f Mon Sep 17 00:00:00 2001 From: hetong007 Date: Wed, 6 May 2015 15:14:29 -0700 Subject: [PATCH 05/60] add demo for early_stopping in R --- R-package/R/xgb.train.R | 56 ++++++++++++++++----------------- R-package/demo/early_Stopping.R | 23 +------------- 2 files changed, 29 insertions(+), 50 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 636ad3cad..01de306a0 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -139,31 +139,34 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), params = append(params, list(...)) # Early stopping - if (!is.null(feval) && is.null(maximize) && !is.null(earlyStopRound)) - stop('Please set maximize to note whether the model is maximizing the evaluation or not.') - if (length(watchlist) == 0 && !is.null(earlyStopRound)) - stop('For 
early stopping you need at least one set in watchlist.') - if (is.null(maximize) && is.null(params$eval_metric) && !is.null(earlyStopRound)) - stop('Please set maximize to note whether the model is maximizing the evaluation or not.') - if (is.null(maximize)) - { - if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { - maximize = FALSE - } else { - maximize = TRUE + if (!is.null(earlyStopRound)){ + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (length(watchlist) == 0) + stop('For early stopping you need at least one set in watchlist.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(watchlist)>1) + warning('Only the first data set in watchlist is used for early stopping process.') } - if (maximize) { - bestScore = 0 - } else { - bestScore = Inf - } - bestInd = 0 - earlyStopflag = FALSE - - if (length(watchlist)>1 && !is.null(earlyStopRound)) - warning('Only the first data set in watchlist is used for early stopping process.') handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) @@ -174,8 +177,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), cat(paste(msg, "\n", sep="")) if (!is.null(earlyStopRound)) { - score = strsplit(msg,'\\s+')[[1]][1] - score = strsplit(score,':')[[1]][2] + score = strsplit(msg,':|\\s+')[[1]][3] score = as.numeric(score) if ((maximize && score>bestScore) || (!maximize && score<bestScore)) { bestScore = score bestInd = i } else { if (i-bestInd>earlyStopRound) { earlyStopflag = TRUE + cat('Stopping.
Best iteration:',bestInd) - break - } } bst <- xgb.Booster.check(bst) if (!is.null(earlyStopRound)) { diff --git a/R-package/demo/early_Stopping.R b/R-package/demo/early_Stopping.R index 3253c3828..4cab385ca 100644 --- a/R-package/demo/early_Stopping.R +++ b/R-package/demo/early_Stopping.R @@ -30,29 +30,8 @@ evalerror <- function(preds, dtrain) { err <- as.numeric(sum(labels != (preds > 0)))/length(labels) return(list(metric = "error", value = err)) } -print ('start training with user customized objective') +print ('start training with early Stopping setting') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE, earlyStopRound = 3) -# -# there can be cases where you want additional information -# being considered besides the property of DMatrix you can get by getinfo -# you can set additional information as attributes if DMatrix -# set label attribute of dtrain to be label, we use label as an example, it can be anything -attr(dtrain, 'label') <- getinfo(dtrain, 'label') -# this is new customized objective, where you can access things you set -# same thing applies to customized evaluation function -logregobjattr <- function(preds, dtrain) { - # now you can access the attribute in customized function - labels <- attr(dtrain, 'label') - preds <- 1/(1 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} -print ('start training with user customized objective, with additional attributes in DMatrix') -# training with customized objective, we can also do step by step training -# simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror, maximize = FALSE, - earlyStopRound = 3) \ No newline at end of file From 993d7b9da3e723eac0b1d34bb8e8d89db2f108fd Mon Sep 17 00:00:00 2001 From: hetong007 Date: Wed, 6 May 2015 15:23:37 -0700 Subject: [PATCH 06/60] update roxygen2 --- R-package/NAMESPACE | 2 +- R-package/man/agaricus.test.Rd | 2 +- R-package/man/agaricus.train.Rd | 2 +- R-package/man/getinfo.Rd | 2 +- R-package/man/nrow-xgb.DMatrix-method.Rd | 2 +- R-package/man/predict-xgb.Booster-method.Rd | 2 +- R-package/man/predict-xgb.Booster.handle-method.Rd | 2 +- R-package/man/setinfo.Rd | 2 +- R-package/man/slice.Rd | 2 +- R-package/man/xgb.DMatrix.Rd | 2 +- R-package/man/xgb.DMatrix.save.Rd | 2 +- R-package/man/xgb.cv.Rd | 2 +- R-package/man/xgb.dump.Rd | 2 +- R-package/man/xgb.importance.Rd | 2 +- R-package/man/xgb.load.Rd | 2 +- R-package/man/xgb.model.dt.tree.Rd | 2 +- R-package/man/xgb.plot.importance.Rd | 2 +- R-package/man/xgb.plot.tree.Rd | 2 +- R-package/man/xgb.save.Rd | 2 +- R-package/man/xgb.save.raw.Rd | 2 +- R-package/man/xgb.train.Rd | 2 +- R-package/man/xgboost.Rd | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index d7f9e455c..a4f07799a 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.0): do not edit by hand +# Generated by roxygen2 (4.1.1): do not edit by hand export(getinfo) export(setinfo) diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index 556425379..c54e30ba3 100644 --- a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 
(4.1.1): do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.test} diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index 879b3d5df..955257148 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.train} diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 618d0d44b..87c507566 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/getinfo.xgb.DMatrix.R \docType{methods} \name{getinfo} diff --git a/R-package/man/nrow-xgb.DMatrix-method.Rd b/R-package/man/nrow-xgb.DMatrix-method.Rd index 953e620bf..f86709afd 100644 --- a/R-package/man/nrow-xgb.DMatrix-method.Rd +++ b/R-package/man/nrow-xgb.DMatrix-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/nrow.xgb.DMatrix.R \docType{methods} \name{nrow,xgb.DMatrix-method} diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index 06fdb2ca8..3ce2e2025 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/predict.xgb.Booster.R \docType{methods} \name{predict,xgb.Booster-method} diff --git a/R-package/man/predict-xgb.Booster.handle-method.Rd b/R-package/man/predict-xgb.Booster.handle-method.Rd index cc9ba29f9..7eb237a94 100644 --- a/R-package/man/predict-xgb.Booster.handle-method.Rd +++ b/R-package/man/predict-xgb.Booster.handle-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/predict.xgb.Booster.handle.R \docType{methods} \name{predict,xgb.Booster.handle-method} diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index 9512f1dfb..edf5284bd 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/setinfo.xgb.DMatrix.R \docType{methods} \name{setinfo} diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index a7812e886..20a78a383 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/slice.xgb.DMatrix.R \docType{methods} \name{slice} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index ea644a291..9d4d19d37 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.DMatrix.R \name{xgb.DMatrix} \alias{xgb.DMatrix} diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index 6bbc277b3..3ba36f55a 100644 --- 
a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.DMatrix.save.R \name{xgb.DMatrix.save} \alias{xgb.DMatrix.save} diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index feee4e18f..19ab788f9 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.cv.R \name{xgb.cv} \alias{xgb.cv} diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 124535211..eaf1ca521 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.dump.R \name{xgb.dump} \alias{xgb.dump} diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 674a54622..11740e4ac 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.importance.R \name{xgb.importance} \alias{xgb.importance} diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 4caef6239..1331ff249 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.load.R \name{xgb.load} \alias{xgb.load} diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index df308a954..c53ed057f 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.model.dt.tree.R \name{xgb.model.dt.tree} \alias{xgb.model.dt.tree} diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 0797b89c2..4147278b9 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.plot.importance.R \name{xgb.plot.importance} \alias{xgb.plot.importance} diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 476dbda11..4501d87ce 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.plot.tree.R \name{xgb.plot.tree} \alias{xgb.plot.tree} diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index 6e6b23e54..eca097fac 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.save.R \name{xgb.save} \alias{xgb.save} diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 94ae29416..79c356c0f 100644 --- 
a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.save.raw.R \name{xgb.save.raw} \alias{xgb.save.raw} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 6a1aa874b..4444b95ae 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgb.train.R \name{xgb.train} \alias{xgb.train} diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index bc2311a2b..9509dbd39 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/xgboost.R \name{xgboost} \alias{xgboost} From 7f7947f31c3024435cae502d9df653036c2c5e4f Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 6 May 2015 15:43:15 -0700 Subject: [PATCH 07/60] add with pbuffer info to model, allow xgb model to be saved in a more memory compact way --- src/gbm/gbtree-inl.hpp | 8 +++++++- src/learner/learner-inl.hpp | 34 +++++++++++++++++++--------------- src/xgboost_main.cpp | 6 +++++- wrapper/xgboost_wrapper.cpp | 6 +++--- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index 0a1ee4f98..c868c302a 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -64,7 +64,13 @@ class GBTree : public IGradBooster { } virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree"); - fo.Write(&mparam, sizeof(ModelParam)); + if (with_pbuffer) { + fo.Write(&mparam, sizeof(ModelParam)); + } else { + ModelParam p = mparam; + p.num_pbuffer = 0; + fo.Write(&p, sizeof(ModelParam)); + } for (size_t i = 0; i < trees.size(); ++i) { trees[i]->SaveModel(fo); } diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index 9ceec969e..5a080d5b1 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -157,11 +157,9 @@ class BoostLearner : public rabit::Serializable { /*!
* \brief load model from stream * \param fi input stream - * \param with_pbuffer whether to load with predict buffer * \param calc_num_feature whether call InitTrainer with calc_num_feature */ inline void LoadModel(utils::IStream &fi, - bool with_pbuffer = true, bool calc_num_feature = true) { utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, "BoostLearner: wrong model format"); @@ -189,15 +187,15 @@ class BoostLearner : public rabit::Serializable { char tmp[32]; utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class); obj_->SetParam("num_class", tmp); - gbm_->LoadModel(fi, with_pbuffer); - if (!with_pbuffer || distributed_mode == 2) { + gbm_->LoadModel(fi, mparam.saved_with_pbuffer != 0); + if (mparam.saved_with_pbuffer == 0) { gbm_->ResetPredBuffer(pred_buffer_size); } } // rabit load model from rabit checkpoint virtual void Load(rabit::Stream *fi) { // for row split, we should not keep pbuffer - this->LoadModel(*fi, distributed_mode != 2, false); + this->LoadModel(*fi, false); } // rabit save model to rabit checkpoint virtual void Save(rabit::Stream *fo) const { @@ -218,18 +216,20 @@ class BoostLearner : public rabit::Serializable { if (header == "bs64") { utils::Base64InStream bsin(fi); bsin.InitPosition(); - this->LoadModel(bsin); + this->LoadModel(bsin, true); } else if (header == "binf") { - this->LoadModel(*fi); + this->LoadModel(*fi, true); } else { delete fi; fi = utils::IStream::Create(fname, "r"); - this->LoadModel(*fi); + this->LoadModel(*fi, true); } delete fi; } - inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const { - fo.Write(&mparam, sizeof(ModelParam)); + inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const { + ModelParam p = mparam; + p.saved_with_pbuffer = static_cast<int>(with_pbuffer); + fo.Write(&p, sizeof(ModelParam)); fo.Write(name_obj_); fo.Write(name_gbm_); gbm_->SaveModel(fo, with_pbuffer); @@ -237,17 +237,18 @@ class BoostLearner : public rabit::Serializable { /*! * \brief save model into file * \param fname file name + * \param with_pbuffer whether save pbuffer together */ - inline void SaveModel(const char *fname) const { + inline void SaveModel(const char *fname, bool with_pbuffer) const { utils::IStream *fo = utils::IStream::Create(fname, "w"); if (save_base64 != 0 || !strcmp(fname, "stdout")) { fo->Write("bs64\t", 5); utils::Base64OutStream bout(fo); - this->SaveModel(bout); + this->SaveModel(bout, with_pbuffer); bout.Finish('\n'); } else { fo->Write("binf", 4); - this->SaveModel(*fo); + this->SaveModel(*fo, with_pbuffer); } delete fo; } @@ -442,14 +443,17 @@ class BoostLearner : public rabit::Serializable { unsigned num_feature; /* \brief number of class, if it is multi-class classification */ int num_class; + /*! \brief whether the model itself is saved with pbuffer */ + int saved_with_pbuffer; /*! \brief reserved field */ - int reserved[31]; + int reserved[30]; /*! \brief constructor */ ModelParam(void) { + std::memset(this, 0, sizeof(ModelParam)); base_score = 0.5f; num_feature = 0; num_class = 0; - std::memset(reserved, 0, sizeof(reserved)); + saved_with_pbuffer = 0; } /*!
* \brief set parameters from outside diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp index ad87f8879..769e3be3b 100644 --- a/src/xgboost_main.cpp +++ b/src/xgboost_main.cpp @@ -87,6 +87,7 @@ class BoostLearnTask { if (!strcmp("name_pred", name)) name_pred = val; if (!strcmp("dsplit", name)) data_split = val; if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); + if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val); if (!strncmp("eval[", name, 5)) { char evname[256]; utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); @@ -115,6 +116,7 @@ class BoostLearnTask { model_dir_path = "./"; data_split = "NONE"; load_part = 0; + save_with_pbuffer = 0; data = NULL; } ~BoostLearnTask(void){ @@ -241,7 +243,7 @@ class BoostLearnTask { } inline void SaveModel(const char *fname) const { if (rabit::GetRank() != 0) return; - learner.SaveModel(fname); + learner.SaveModel(fname, save_with_pbuffer != 0); } inline void SaveModel(int i) const { char fname[256]; @@ -297,6 +299,8 @@ class BoostLearnTask { int pred_margin; /*! \brief whether dump statistics along with model */ int dump_model_stats; + /*! \brief whether save prediction buffer */ + int save_with_pbuffer; /*! \brief name of feature map */ std::string name_fmap; /*! \brief name of dump file */ diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 8ec3aa3f4..be2a2001c 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -58,13 +58,13 @@ class Booster: public learner::BoostLearner { } inline void LoadModelFromBuffer(const void *buf, size_t size) { utils::MemoryFixSizeBuffer fs((void*)buf, size); - learner::BoostLearner::LoadModel(fs); + learner::BoostLearner::LoadModel(fs, true); this->init_model = true; } inline const char *GetModelRaw(bst_ulong *out_len) { model_str.resize(0); utils::MemoryBufferStream fs(&model_str); - learner::BoostLearner::SaveModel(fs); + learner::BoostLearner::SaveModel(fs, false); *out_len = static_cast<bst_ulong>(model_str.length()); if (*out_len == 0) { return NULL; } @@ -323,7 +323,7 @@ extern "C"{ static_cast<Booster*>(handle)->LoadModel(fname); } void XGBoosterSaveModel(const void *handle, const char *fname) { - static_cast<Booster*>(handle)->SaveModel(fname); + static_cast<Booster*>(handle)->SaveModel(fname, false); } void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) { static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len); From ba49f82ace8c10ba0e4eb5af61d5d192eb756855 Mon Sep 17 00:00:00 2001 From: Tong He Date: Wed, 6 May 2015 15:46:15 -0700 Subject: [PATCH 08/60] update to 0.4 --- R-package/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index e1383d3ad..0a183cbe7 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: xgboost Type: Package Title: eXtreme Gradient Boosting -Version: 0.3-4 +Version: 0.4-0 Date: 2014-12-28 Author: Tianqi Chen , Tong He , Michael Benesty Maintainer: Tong He From 594bed34e474028274cc4925fe08764b10538859 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 6 May 2015 16:42:27 -0700 Subject: [PATCH 09/60] fix saveraw --- wrapper/xgboost.py | 53 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 48fa02b76..9051061d0 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -91,7 +91,14 @@ def ctypes2numpy(cptr, length, dtype): raise RuntimeError('memmove failed') return res -
+def ctypes2buffer(cptr, length): + if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): + raise RuntimeError('expected char pointer') + res = np.zeros(length, dtype='uint8') + if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): + raise RuntimeError('memmove failed') + return res + def c_str(string): return ctypes.c_char_p(string.encode('utf-8')) @@ -470,19 +477,26 @@ class Booster(object): Parameters ---------- - fname : string or file handle - Output file name or handle. If a handle is given must be a BytesIO - object or a file opened for writing in binary format. + fname : string + Output file name or handle """ if isinstance(fname, string_types): # assume file name xglib.XGBoosterSaveModel(self.handle, c_str(fname)) else: - length = ctypes.c_ulong() - cptr = xglib.XGBoosterGetModelRaw(self.handle, - ctypes.byref(length)) - address = ctypes.addressof(cptr.contents) - buf = (ctypes.c_char * length.value).from_address(address) - fname.write(buf) + raise Exception("fname must be a string") + + def save_raw(self): + """ + Save the model to an in memory buffer representation + + Returns + ------- + an in memory buffer representation of the model + """ + length = ctypes.c_ulong() + cptr = xglib.XGBoosterGetModelRaw(self.handle, + ctypes.byref(length)) + return ctypes2buffer(cptr, length.value) def load_model(self, fname): """ @@ -491,14 +505,14 @@ class Booster(object): Parameters ---------- fname : string of file handle - Input file name or file handle object. + Input file name or memory buffer (see also save_raw) """ - if isinstance(fname, string_types): # assume file name + if isinstance(fname, str): # assume file name xglib.XGBoosterLoadModel(self.handle, c_str(fname)) else: - buf = fname.getbuffer() - length = ctypes.c_ulong(buf.nbytes) - ptr = ctypes.byref(ctypes.c_void_p.from_buffer(buf)) + buf = fname + length = ctypes.c_ulong(len(buf)) + ptr = (ctypes.c_char * len(buf)).from_buffer(buf) xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length) def dump_model(self, fo, fmap='', with_stats=False): @@ -861,12 +875,9 @@ class XGBModel(XGBModelBase): # can't pickle ctypes pointers so put _Booster in a BytesIO obj this = self.__dict__.copy() # don't modify in place - - tmp = BytesIO() - this["_Booster"].save_model(tmp) - tmp.seek(0) - this["_Booster"] = tmp - + raw = this["_Booster"].save_raw() + this["_Booster"] = raw + return this def __setstate__(self, state): From 60bf38982559226c61ccb1b7b5072b81892de1e9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 6 May 2015 16:45:05 -0700 Subject: [PATCH 10/60] update version to be consistent with python --- wrapper/setup.py | 2 +- wrapper/xgboost.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/wrapper/setup.py b/wrapper/setup.py index 14f9e9b55..52bf1cf82 100644 --- a/wrapper/setup.py +++ b/wrapper/setup.py @@ -28,7 +28,7 @@ if len(lib_path) == 0: raise XGBoostLibraryNotFound("XGBoost library not found.
Did you run " "../make?") setup(name="xgboost", - version="0.32", + version="0.40", description="Python wrappers for XGBoost: eXtreme Gradient Boosting", zip_safe=False, py_modules=['xgboost'], diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 9051061d0..40ffaa84b 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -1,8 +1,8 @@ # coding: utf-8 - """ xgboost: eXtreme Gradient Boosting library +Version: 0.40 Authors: Tianqi Chen, Bing Xu Early stopping by Zygmunt Zając """ From 2d748fb6fa70dde0d587505b8f0fb73b419dce4c Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 6 May 2015 16:46:27 -0700 Subject: [PATCH 11/60] Update xgboost.py --- wrapper/xgboost.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 40ffaa84b..ebe6a7f8b 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -478,7 +478,7 @@ class Booster(object): Parameters ---------- fname : string - Output file name or handle + Output file name """ if isinstance(fname, string_types): # assume file name xglib.XGBoosterSaveModel(self.handle, c_str(fname)) @@ -504,7 +504,7 @@ class Booster(object): Parameters ---------- - fname : string of file handle + fname : string or a memory buffer Input file name or memory buffer (see also save_raw) """ if isinstance(fname, str): # assume file name From c6c7dc0a93f047a69346ff638ce0f970ec3d6043 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 6 May 2015 17:11:39 -0700 Subject: [PATCH 12/60] Update CHANGES.md --- CHANGES.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d834ce79d..6a4fabe9f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,8 +21,16 @@ xgboost-0.3 * Add [Code Guide](src/README.md) for customizing objective function and evaluation * Add R module -in progress version +in progress 0.4 ===== -* Distributed version -* Feature importance visualization in R module, thanks to Michael Benesty -* Predict leaf inde +* Distributed version of xgboost that runs on YARN, scales to billions of examples +* Direct save/load data and model from/to S3 and HDFS +* Feature importance visualization in R module, by Michael Benesty +* Predict leaf index +* Poisson regression for counts data +* Early stopping option in training +* Native save/load support in R and python + - xgboost models now can be saved using save/load in R + - xgboost python model is now picklable +* sklearn wrapper is supported in python module +* Experimental external memory version From 68444a06269013efd133ee6e5535faad203110b9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 7 May 2015 18:11:40 -0700 Subject: [PATCH 13/60] fix pkl problem --- wrapper/xgboost.py | 43 ++++++++++++++++++++++++++----------- wrapper/xgboost_wrapper.cpp | 7 ++++-- wrapper/xgboost_wrapper.h | 2 +- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 40ffaa84b..f0a516e2d 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -31,6 +31,9 @@ except ImportError: class XGBoostLibraryNotFound(Exception): pass +class XGBoostError(Exception): + pass + __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: @@ -483,7 +486,7 @@ class Booster(object): if isinstance(fname, string_types): # assume file name xglib.XGBoosterSaveModel(self.handle, c_str(fname)) else: - raise Exception("fname must be a string") + raise TypeError("fname must be a string") def save_raw(self): """ @@ -852,7 +855,7 @@ class
XGBModel(XGBModelBase): nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, base_score=0.5, seed=0): if not SKLEARN_INSTALLED: - raise Exception('sklearn needs to be installed in order to use this module') + raise XGBError('sklearn needs to be installed in order to use this module') self.max_depth = max_depth self.learning_rate = learning_rate self.n_estimators = n_estimators @@ -869,22 +872,36 @@ class XGBModel(XGBModelBase): self.base_score = base_score self.seed = seed - self._Booster = Booster() + self._Booster = None def __getstate__(self): # can't pickle ctypes pointers so put _Booster in a BytesIO obj - - this = self.__dict__.copy() # don't modify in place - raw = this["_Booster"].save_raw() - this["_Booster"] = raw - + this = self.__dict__.copy() # don't modify in place + bst = this["_Booster"] + if bst is not None: + raw = this["_Booster"].save_raw() + this["_Booster"] = raw return this def __setstate__(self, state): - booster = state["_Booster"] - state["_Booster"] = Booster(model_file=booster) + bst = state["_Booster"] + if bst is not None: + state["_Booster"] = Booster(model_file=booster) self.__dict__.update(state) + def booster(self): + """ + get the underlying xgboost Booster of this model + will raise an exception when fit was not called + + Returns + ------- + booster : a xgboost booster of underlying model + """ + if self._Booster is None: + raise XGBError('need to call fit beforehand') + return self._Booster + def get_xgb_params(self): xgb_params = self.get_params() @@ -901,7 +918,7 @@ class XGBModel(XGBModelBase): def predict(self, X): testDmatrix = DMatrix(X) - return self._Booster.predict(testDmatrix) + return self.booster().predict(testDmatrix) class XGBClassifier(XGBModel, XGBClassifier): @@ -942,7 +959,7 @@ class XGBClassifier(XGBModel, XGBClassifier): def predict(self, X): testDmatrix = DMatrix(X) - class_probs = self._Booster.predict(testDmatrix) + class_probs = self.booster().predict(testDmatrix) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: @@ -952,7 +969,7 @@ class XGBClassifier(XGBModel, XGBClassifier): def predict_proba(self, X): testDmatrix = DMatrix(X) - class_probs = self._Booster.predict(testDmatrix) + class_probs = self.booster().predict(testDmatrix) if self.objective == "multi:softprob": return class_probs else: diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index be2a2001c..4d7828faf 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -62,6 +62,7 @@ class Booster: public learner::BoostLearner { this->init_model = true; } inline const char *GetModelRaw(bst_ulong *out_len) { + this->CheckInitModel(); model_str.resize(0); utils::MemoryBufferStream fs(&model_str); learner::BoostLearner::SaveModel(fs, false); @@ -322,8 +323,10 @@ extern "C"{ void XGBoosterLoadModel(void *handle, const char *fname) { static_cast(handle)->LoadModel(fname); } - void XGBoosterSaveModel(const void *handle, const char *fname) { - static_cast(handle)->SaveModel(fname, false); + void XGBoosterSaveModel(void *handle, const char *fname) { + Booster *bst = static_cast(handle); + bst->CheckInitModel(); + bst->SaveModel(fname, false); } void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) { static_cast(handle)->LoadModelFromBuffer(buf, len); diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index f1d2cc92a..88a327d0d 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -203,7 +203,7 @@ extern 
"C" { * \param handle handle * \param fname file name */ - XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname); + XGB_DLL void XGBoosterSaveModel(void *handle, const char *fname); /*! * \brief load model from in memory buffer * \param handle handle From a4de0ebcd4e5a5b699794d3b78471b44890fea53 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 7 May 2015 18:21:15 -0700 Subject: [PATCH 14/60] change numpy to bytearray as buffer --- wrapper/xgboost.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index f65213955..ed0b1c2df 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -97,11 +97,12 @@ def ctypes2numpy(cptr, length, dtype): def ctypes2buffer(cptr, length): if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): raise RuntimeError('expected char pointer') - res = np.zeros(length, dtype='uint8') - if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): + res = bytearray(length) + rptr = (ctypes.c_char * length).from_buffer(res) + if not ctypes.memmove(rptr, cptr, length): raise RuntimeError('memmove failed') return res - + def c_str(string): return ctypes.c_char_p(string.encode('utf-8')) @@ -886,7 +887,7 @@ class XGBModel(XGBModelBase): def __setstate__(self, state): bst = state["_Booster"] if bst is not None: - state["_Booster"] = Booster(model_file=booster) + state["_Booster"] = Booster(model_file=bst) self.__dict__.update(state) def booster(self): From 3a534d264dfef77ac0184e464388060207ad56ff Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 9 May 2015 17:39:45 -0700 Subject: [PATCH 15/60] fix wrapper gc bug --- R-package/src/xgboost_R.cpp | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 15957480c..de6ed339f 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -70,10 +70,10 @@ extern "C" { SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { _WrapperBegin(); void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent)); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixCreateFromMat_R(SEXP mat, @@ -91,10 +91,10 @@ extern "C" { } } void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing)); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, @@ -120,10 +120,10 @@ extern "C" { } void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_), BeginPtr(data_), nindptr, ndata); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { @@ -134,10 +134,10 @@ extern "C" { idxvec[i] = INTEGER(idxset)[i] - 1; } void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) { @@ -157,10 +157,7 @@ extern "C" { vec[i] = 
static_cast(INTEGER(array)[i]); } XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len); - _WrapperEnd(); - return; - } - { + } else { std::vector vec(len); #pragma omp parallel for schedule(static) for (int i = 0; i < len; ++i) { @@ -177,12 +174,12 @@ extern "C" { bst_ulong olen; const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen); + _WrapperEnd(); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { REAL(ret)[i] = res[i]; } UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixNumRow_R(SEXP handle) { @@ -203,10 +200,10 @@ extern "C" { dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); } void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size()); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { @@ -252,10 +249,12 @@ extern "C" { for (int i = 0; i < len; ++i) { vec_sptr.push_back(vec_names[i].c_str()); } + const char *ret = + XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), + asInteger(iter), + BeginPtr(vec_dmats), BeginPtr(vec_sptr), len); _WrapperEnd(); - return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), - asInteger(iter), - BeginPtr(vec_dmats), BeginPtr(vec_sptr), len)); + return mkString(ret); } SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) { _WrapperBegin(); @@ -265,12 +264,12 @@ extern "C" { asInteger(option_mask), asInteger(ntree_limit), &olen); + _WrapperEnd(); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { REAL(ret)[i] = res[i]; } UNPROTECT(1); - _WrapperEnd(); return ret; } void XGBoosterLoadModel_R(SEXP handle, SEXP fname) { @@ -305,17 +304,18 @@ extern "C" { SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) { _WrapperBegin(); bst_ulong olen; - const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle), - CHAR(asChar(fmap)), - asInteger(with_stats), - &olen); + const char **res = + XGBoosterDumpModel(R_ExternalPtrAddr(handle), + CHAR(asChar(fmap)), + asInteger(with_stats), + &olen); + _WrapperEnd(); SEXP out = PROTECT(allocVector(STRSXP, olen)); for (size_t i = 0; i < olen; ++i) { stringstream stream; stream << "booster["< Date: Sat, 9 May 2015 18:08:47 -0700 Subject: [PATCH 16/60] support both early stop name --- R-package/R/xgb.train.R | 18 +++++++++++------- R-package/R/xgboost.R | 11 +++++++---- R-package/man/xgb.train.Rd | 10 ++++++---- R-package/man/xgboost.Rd | 9 ++++++--- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 286b16987..f6e60e23d 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -66,10 +66,11 @@ #' prediction and dtrain, #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both -#' @param earlyStopRound If \code{NULL}, the early stopping function is not triggered. +#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. -#' @param maximize If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. 
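
As an aside, the bookkeeping behind early_stop_round, shared by the R and Python wrappers in this series, reduces to a few lines of state tracking. A minimal sketch in Python; scores and k are stand-ins for the per-round evaluation values and the patience window, not library API. Note the stopping test: this patch compares with >, and a later patch in the series tightens it to >=.

    def early_stop_index(scores, k, maximize):
        best_score = 0.0 if maximize else float('inf')
        best_ind = 0
        for i, score in enumerate(scores, start=1):
            improved = score > best_score if maximize else score < best_score
            if improved:
                best_score, best_ind = score, i
            elif i - best_ind >= k:
                break  # k consecutive rounds without improvement
        return best_ind, best_score
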
+#' @param early.stop.round An alternative of \code{early_stop_round}. +#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' @@ -119,7 +120,8 @@ #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, - earlyStopRound = NULL, maximize = NULL, ...) { + early_stop_round = NULL, early.stop.round = NULL, + maximize = NULL, ...) { dtrain <- data if (typeof(params) != "list") { stop("xgb.train: first argument params must be list") @@ -139,7 +141,9 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), params = append(params, list(...)) # Early stopping - if (!is.null(earlyStopRound)){ + if (is.null(early_stop_round) && !is.null(early.stop.round)) + early_stop_round = early.stop.round + if (!is.null(early_stop_round)){ if (!is.null(feval) && is.null(maximize)) stop('Please set maximize to note whether the model is maximizing the evaluation or not.') if (length(watchlist) == 0) @@ -175,7 +179,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) cat(paste(msg, "\n", sep="")) - if (!is.null(earlyStopRound)) + if (!is.null(early_stop_round)) { score = strsplit(msg,':|\\s+')[[1]][3] score = as.numeric(score) @@ -183,7 +187,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bestScore = score bestInd = i } else { - if (i-bestInd>earlyStopRound) { + if (i-bestInd>early_stop_round) { earlyStopflag = TRUE cat('Stopping. Best iteration:',bestInd) break @@ -193,7 +197,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } } bst <- xgb.Booster.check(bst) - if (!is.null(earlyStopRound)) { + if (!is.null(early_stop_round)) { bst$bestScore = bestScore bst$bestInd = bestInd } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 367a149e7..4e4fbaa2c 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -30,10 +30,11 @@ #' performance and construction progress information #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values. -#' @param earlyStopRound If \code{NULL}, the early stopping function is not triggered. +#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. -#' @param maximize If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. +#' @param early.stop.round An alternative of \code{early_stop_round}. +#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' @@ -56,7 +57,8 @@ #' @export #' xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, - verbose = 1, earlyStopRound = NULL, maximize = NULL, ...) { + verbose = 1, early_stop_round = NULL, early.stop.round = NULL, + maximize = NULL, ...) 
{ if (is.null(missing)) { dtrain <- xgb.get.DMatrix(data, label) } else { @@ -72,7 +74,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), } bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, - earlyStopRound = earlyStopRound) + early_stop_round = early_stop_round, + early.stop.round = early.stop.round) return(bst) } diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index fa5ede220..a3317f1ab 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -5,8 +5,8 @@ \title{eXtreme Gradient Boosting Training} \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, - feval = NULL, verbose = 1, earlyStopRound = NULL, maximize = NULL, - ...) + feval = NULL, verbose = 1, early_stop_round = NULL, + early.stop.round = NULL, maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. @@ -78,11 +78,13 @@ prediction and dtrain,} \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print information of performance. If 2, xgboost will print information of both} -\item{earlyStopRound}{If \code{NULL}, the early stopping function is not triggered. +\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} -\item{maximize}{If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. +\item{early.stop.round}{An alternative of \code{early_stop_round}.} + +\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. \code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 9509dbd39..01c519e2e 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -5,7 +5,8 @@ \title{eXtreme Gradient Boosting (Tree) library} \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, earlyStopRound = NULL, maximize = NULL, ...) + nrounds, verbose = 1, early_stop_round = NULL, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or @@ -41,11 +42,13 @@ Commonly used ones are: information of performance. If 2, xgboost will print information of both performance and construction progress information} -\item{earlyStopRound}{If \code{NULL}, the early stopping function is not triggered. +\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} -\item{maximize}{If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well. +\item{early.stop.round}{An alternative of \code{early_stop_round}.} + +\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. 
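
For comparison, the Python wrapper in this repository already ships early stopping (its module header credits the feature); a hedged usage sketch, with the option and attribute names assumed from the wrapper at this revision, and param, dtrain, dtest assumed defined:

    bst = xgb.train(param, dtrain, num_boost_round=100,
                    evals=[(dtest, 'eval')],
                    early_stopping_rounds=3)    # assumed spelling of the option
    print(bst.best_iteration, bst.best_score)   # assumed attributes, mirroring bestInd/bestScore
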
\code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} From 08848ab3eeaa042bd90d2e4140f56aa08b682599 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 10 May 2015 17:45:20 -0700 Subject: [PATCH 17/60] Update README.md --- README.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 093d8294d..333bd4fa7 100644 --- a/README.md +++ b/README.md @@ -37,16 +37,15 @@ What's New Features ======== -* Sparse feature format: - - Sparse feature format allows easy handling of missing values, and improve computation efficiency. -* Push the limit on single machine: - - Efficient implementation that optimizes memory and computation. -* Speed: XGBoost is very fast - - IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier -* Layout of gradient boosting algorithm to support user defined objective -* Distributed and portable - - The distributed version of xgboost is highly portable and can be used in different platforms - - It inheritates all the optimizations made in single machine mode, maximumly utilize the resources using both multi-threading and distributed computing. +* Easily accessible in python, R, Julia, CLI +* Fast speed and memory efficient + - Can be more than 10 times faster than GBM in sklearn and R + - Handles sparse matrices, support external memory +* Accurate prediction, and used extensively by data scientists and kagglers + - See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links) +* Distributed and Portable + - The distributed version runs on Hadoop (YARN), MPI, SGE etc. + - Scales to billions of examples and beyond Build ======= From 932af821c557c910e42ca22b66fa895f3fbcbdb2 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 8 May 2015 09:25:55 -0500 Subject: [PATCH 18/60] CLN: Remove unused import. Fix comment. 
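
Patches 13, 14, and this cleanup settle how sklearn-wrapper models survive pickling: __getstate__ swaps _Booster for its save_raw() bytearray and __setstate__ rebuilds it. A quick round-trip check of the result; X and y are assumed training arrays:

    import pickle
    import xgboost as xgb

    clf = xgb.XGBClassifier().fit(X, y)
    clf2 = pickle.loads(pickle.dumps(clf))           # the raw bytearray pickles cleanly
    assert (clf.predict(X) == clf2.predict(X)).all()
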
--- wrapper/xgboost.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index ed0b1c2df..25ff8b1b2 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -15,7 +15,6 @@ import re import ctypes import platform import collections -from io import BytesIO import numpy as np import scipy.sparse @@ -492,7 +491,7 @@ class Booster(object): def save_raw(self): """ Save the model to a in memory buffer represetation - + Returns ------- a in memory buffer represetation of the model @@ -876,12 +875,12 @@ class XGBModel(XGBModelBase): self._Booster = None def __getstate__(self): - # can't pickle ctypes pointers so put _Booster in a BytesIO obj - this = self.__dict__.copy() # don't modify in place + # can't pickle ctypes pointers so put _Booster in a bytearray object + this = self.__dict__.copy() # don't modify in place bst = this["_Booster"] if bst is not None: raw = this["_Booster"].save_raw() - this["_Booster"] = raw + this["_Booster"] = raw return this def __setstate__(self, state): @@ -894,7 +893,7 @@ class XGBModel(XGBModelBase): """ get the underlying xgboost Booster of this model will raise an exception when fit was not called - + Returns ------- booster : a xgboost booster of underlying model @@ -902,7 +901,7 @@ class XGBModel(XGBModelBase): if self._Booster is None: raise XGBError('need to call fit beforehand') return self._Booster - + def get_xgb_params(self): xgb_params = self.get_params() From 99c2df99137bdf4e2bda28584001b33b86f672a4 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 8 May 2015 14:34:37 -0500 Subject: [PATCH 19/60] EX: Show example of pickling and parallel use. --- demo/guide-python/sklearn_examples.py | 122 ++++++++++++++++---------- 1 file changed, 74 insertions(+), 48 deletions(-) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index dd0620a7c..b378d28cc 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -4,60 +4,86 @@ Created on 1 Apr 2015 @author: Jamie Hall ''' +if __name__ == "__main__": + # NOTE: This *has* to be here and in the `__name__ == "__main__"` clause + # to run XGBoost in parallel, if XGBoost was built with OpenMP support. + # Otherwise, you can use fork, which is the default backend for joblib, + # and omit this. 
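
The same raw-buffer round trip is usable directly at the Booster level; a sketch assuming the save_raw/load-from-buffer pair added in patches 13 and 14, with params and dtrain assumed defined:

    bst = xgb.train(params, dtrain, num_boost_round=10)
    raw = bst.save_raw()                  # bytearray: picklable, shippable
    bst2 = xgb.Booster(model_file=raw)    # load_model branches on str vs. buffer
    assert (bst.predict(dtrain) == bst2.predict(dtrain)).all()
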
+ from multiprocessing import set_start_method + set_start_method("forkserver") -import xgboost as xgb + import pickle + import os + import xgboost as xgb -import numpy as np -from sklearn.cross_validation import KFold -from sklearn.grid_search import GridSearchCV -from sklearn.metrics import confusion_matrix, mean_squared_error -from sklearn.datasets import load_iris, load_digits, load_boston + import numpy as np + from sklearn.cross_validation import KFold + from sklearn.grid_search import GridSearchCV + from sklearn.metrics import confusion_matrix, mean_squared_error + from sklearn.datasets import load_iris, load_digits, load_boston -rng = np.random.RandomState(31337) + rng = np.random.RandomState(31337) + print("Zeros and Ones from the Digits dataset: binary classification") + digits = load_digits(2) + y = digits['target'] + X = digits['data'] + kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) + for train_index, test_index in kf: + xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + predictions = xgb_model.predict(X[test_index]) + actuals = y[test_index] + print(confusion_matrix(actuals, predictions)) -print("Zeros and Ones from the Digits dataset: binary classification") -digits = load_digits(2) -y = digits['target'] -X = digits['data'] -kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) -for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) - predictions = xgb_model.predict(X[test_index]) - actuals = y[test_index] - print(confusion_matrix(actuals, predictions)) + print("Iris: multiclass classification") + iris = load_iris() + y = iris['target'] + X = iris['data'] + kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) + for train_index, test_index in kf: + xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + predictions = xgb_model.predict(X[test_index]) + actuals = y[test_index] + print(confusion_matrix(actuals, predictions)) -print("Iris: multiclass classification") -iris = load_iris() -y = iris['target'] -X = iris['data'] -kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) -for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) - predictions = xgb_model.predict(X[test_index]) - actuals = y[test_index] - print(confusion_matrix(actuals, predictions)) + print("Boston Housing: regression") + boston = load_boston() + y = boston['target'] + X = boston['data'] + kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) + for train_index, test_index in kf: + xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index]) + predictions = xgb_model.predict(X[test_index]) + actuals = y[test_index] + print(mean_squared_error(actuals, predictions)) -print("Boston Housing: regression") -boston = load_boston() -y = boston['target'] -X = boston['data'] -kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) -for train_index, test_index in kf: - xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index]) - predictions = xgb_model.predict(X[test_index]) - actuals = y[test_index] - print(mean_squared_error(actuals, predictions)) - -print("Parameter optimization") -y = boston['target'] -X = boston['data'] -xgb_model = xgb.XGBRegressor() -clf = GridSearchCV(xgb_model, - {'max_depth': [2,4,6], - 'n_estimators': [50,100,200]}, verbose=1) -clf.fit(X,y) -print(clf.best_score_) -print(clf.best_params_) + print("Parameter optimization") + y = boston['target'] + X = boston['data'] + xgb_model = 
xgb.XGBRegressor() + clf = GridSearchCV(xgb_model, + {'max_depth': [2,4,6], + 'n_estimators': [50,100,200]}, verbose=1) + clf.fit(X,y) + print(clf.best_score_) + print(clf.best_params_) + # The sklearn API models are picklable + print("Pickling sklearn API models") + # must open in binary format to pickle + pickle.dump(clf, open("best_boston.pkl", "wb")) + clf2 = pickle.load(open("best_boston.pkl", "rb")) + print(np.allclose(clf.predict(X), clf2.predict(X))) + print("Parallel Parameter optimization") + os.environ["OMP_NUM_THREADS"] = "1" + y = boston['target'] + X = boston['data'] + xgb_model = xgb.XGBRegressor() + clf = GridSearchCV(xgb_model, + {'max_depth': [2,4,6], + 'n_estimators': [50,100,200]}, verbose=1, + n_jobs=2) + clf.fit(X, y) + print(clf.best_score_) + print(clf.best_params_) From fa8c6e2f0b5f04b8b5b9c7fb12ed16ca46a9dc77 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 8 May 2015 14:34:58 -0500 Subject: [PATCH 20/60] DOC: Add warning about fork + openmp --- wrapper/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/wrapper/README.md b/wrapper/README.md index 9c0399693..b863353b1 100644 --- a/wrapper/README.md +++ b/wrapper/README.md @@ -7,6 +7,8 @@ Python * To make the python module, type ```./build.sh``` in the root directory of project * Install with `python setup.py install` from this directory. * Refer also to the walk through example in [demo folder](../demo/guide-python) +* **NOTE**: if you want to run XGBoost in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_examples.py demo. + R ===== From 15ea00540a7c31d208d6a11c096b7a172ddadad2 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 11 May 2015 09:30:51 -0500 Subject: [PATCH 21/60] EX: Make separate example for fork issue. --- demo/guide-python/sklearn_examples.py | 130 +++++++++++--------------- demo/guide-python/sklearn_parallel.py | 35 +++++++ 2 files changed, 89 insertions(+), 76 deletions(-) create mode 100644 demo/guide-python/sklearn_parallel.py diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index b378d28cc..ce8c8d01e 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -4,86 +4,64 @@ Created on 1 Apr 2015 @author: Jamie Hall ''' -if __name__ == "__main__": - # NOTE: This *has* to be here and in the `__name__ == "__main__"` clause - # to run XGBoost in parallel, if XGBoost was built with OpenMP support. - # Otherwise, you can use fork, which is the default backend for joblib, - # and omit this. 
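
One detail from patch 13 worth keeping in mind when reading these demos: _Booster now starts as None, so anything touching the underlying handle goes through an accessor that fails before fit. A sketch of that guard; patch 13 writes raise XGBError(...), which looks like a typo for the XGBoostError class defined in the same patch, so the full name is used here:

    def booster(self):
        """Return the underlying Booster, failing loudly before fit()."""
        if self._Booster is None:
            raise XGBoostError('need to call fit beforehand')
        return self._Booster
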
- from multiprocessing import set_start_method - set_start_method("forkserver") +import pickle +import xgboost as xgb - import pickle - import os - import xgboost as xgb +import numpy as np +from sklearn.cross_validation import KFold +from sklearn.metrics import confusion_matrix, mean_squared_error +from sklearn.grid_search import GridSearchCV +from sklearn.datasets import load_iris, load_digits, load_boston - import numpy as np - from sklearn.cross_validation import KFold - from sklearn.grid_search import GridSearchCV - from sklearn.metrics import confusion_matrix, mean_squared_error - from sklearn.datasets import load_iris, load_digits, load_boston +rng = np.random.RandomState(31337) - rng = np.random.RandomState(31337) +print("Zeros and Ones from the Digits dataset: binary classification") +digits = load_digits(2) +y = digits['target'] +X = digits['data'] +kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) +for train_index, test_index in kf: + xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + predictions = xgb_model.predict(X[test_index]) + actuals = y[test_index] + print(confusion_matrix(actuals, predictions)) - print("Zeros and Ones from the Digits dataset: binary classification") - digits = load_digits(2) - y = digits['target'] - X = digits['data'] - kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) - for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) - predictions = xgb_model.predict(X[test_index]) - actuals = y[test_index] - print(confusion_matrix(actuals, predictions)) +print("Iris: multiclass classification") +iris = load_iris() +y = iris['target'] +X = iris['data'] +kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) +for train_index, test_index in kf: + xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + predictions = xgb_model.predict(X[test_index]) + actuals = y[test_index] + print(confusion_matrix(actuals, predictions)) - print("Iris: multiclass classification") - iris = load_iris() - y = iris['target'] - X = iris['data'] - kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) - for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) - predictions = xgb_model.predict(X[test_index]) - actuals = y[test_index] - print(confusion_matrix(actuals, predictions)) +print("Boston Housing: regression") +boston = load_boston() +y = boston['target'] +X = boston['data'] +kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) +for train_index, test_index in kf: + xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index]) + predictions = xgb_model.predict(X[test_index]) + actuals = y[test_index] + print(mean_squared_error(actuals, predictions)) - print("Boston Housing: regression") - boston = load_boston() - y = boston['target'] - X = boston['data'] - kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) - for train_index, test_index in kf: - xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index]) - predictions = xgb_model.predict(X[test_index]) - actuals = y[test_index] - print(mean_squared_error(actuals, predictions)) +print("Parameter optimization") +y = boston['target'] +X = boston['data'] +xgb_model = xgb.XGBRegressor() +clf = GridSearchCV(xgb_model, + {'max_depth': [2,4,6], + 'n_estimators': [50,100,200]}, verbose=1) +clf.fit(X,y) +print(clf.best_score_) +print(clf.best_params_) - print("Parameter optimization") - y = boston['target'] - X = boston['data'] 
- xgb_model = xgb.XGBRegressor() - clf = GridSearchCV(xgb_model, - {'max_depth': [2,4,6], - 'n_estimators': [50,100,200]}, verbose=1) - clf.fit(X,y) - print(clf.best_score_) - print(clf.best_params_) - - # The sklearn API models are picklable - print("Pickling sklearn API models") - # must open in binary format to pickle - pickle.dump(clf, open("best_boston.pkl", "wb")) - clf2 = pickle.load(open("best_boston.pkl", "rb")) - print(np.allclose(clf.predict(X), clf2.predict(X))) - - print("Parallel Parameter optimization") - os.environ["OMP_NUM_THREADS"] = "1" - y = boston['target'] - X = boston['data'] - xgb_model = xgb.XGBRegressor() - clf = GridSearchCV(xgb_model, - {'max_depth': [2,4,6], - 'n_estimators': [50,100,200]}, verbose=1, - n_jobs=2) - clf.fit(X, y) - print(clf.best_score_) - print(clf.best_params_) +# The sklearn API models are picklable +print("Pickling sklearn API models") +# must open in binary format to pickle +pickle.dump(clf, open("best_boston.pkl", "wb")) +clf2 = pickle.load(open("best_boston.pkl", "rb")) +print(np.allclose(clf.predict(X), clf2.predict(X))) diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py new file mode 100644 index 000000000..803f3fac8 --- /dev/null +++ b/demo/guide-python/sklearn_parallel.py @@ -0,0 +1,35 @@ +import os + +if __name__ == "__main__": + # NOTE: on posix systems, this *has* to be here and in the + # `__name__ == "__main__"` clause to run XGBoost in parallel processes + # using fork, if XGBoost was built with OpenMP support. Otherwise, if you + # build XGBoost without OpenMP support, you can use fork, which is the + # default backend for joblib, and omit this. + try: + from multiprocessing import set_start_method + except ImportError: + raise ImportError("Unable to import multiprocessing.set_start_method." + " This example only runs on Python 3.4") + set_start_method("forkserver") + + import numpy as np + from sklearn.grid_search import GridSearchCV + from sklearn.datasets import load_boston + import xgboost as xgb + + rng = np.random.RandomState(31337) + + print("Parallel Parameter optimization") + boston = load_boston() + + os.environ["OMP_NUM_THREADS"] = "2" # or to whatever you want + y = boston['target'] + X = boston['data'] + xgb_model = xgb.XGBRegressor() + clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], + 'n_estimators': [50, 100, 200]}, verbose=1, + n_jobs=2) + clf.fit(X, y) + print(clf.best_score_) + print(clf.best_params_) From 9c0ba6708894afd345e0ffbc4b173b26696d859d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 11 May 2015 08:45:59 -0700 Subject: [PATCH 22/60] Update README.md --- wrapper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrapper/README.md b/wrapper/README.md index b863353b1..ab013faf6 100644 --- a/wrapper/README.md +++ b/wrapper/README.md @@ -7,7 +7,7 @@ Python * To make the python module, type ```./build.sh``` in the root directory of project * Install with `python setup.py install` from this directory. * Refer also to the walk through example in [demo folder](../demo/guide-python) -* **NOTE**: if you want to run XGBoost in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_examples.py demo. +* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. 
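
Where forkserver is unavailable, the spawn backend gives the same safety, under the stated assumption that workers re-import the module; a minimal sketch:

    if __name__ == "__main__":
        from multiprocessing import set_start_method
        set_start_method("spawn")  # like forkserver, avoids forking an OpenMP-initialized process
        # ...then GridSearchCV(..., n_jobs=2).fit(X, y) as in sklearn_parallel.py
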
Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo. R From 5dacab0e22d83efc2506e270036d74f61236d140 Mon Sep 17 00:00:00 2001 From: by321 Date: Fri, 8 May 2015 00:01:09 -0700 Subject: [PATCH 23/60] new parameter in xgboost() and xgb.train() to print every N-th progress message --- R-package/R/xgb.train.R | 7 +++++-- R-package/R/xgboost.R | 5 +++-- R-package/man/xgb.train.Rd | 4 +++- R-package/man/xgboost.Rd | 4 +++- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index f6e60e23d..fbb3802d2 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -66,6 +66,7 @@ #' prediction and dtrain, #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both +#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. #' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. @@ -119,7 +120,7 @@ #' @export #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), - obj = NULL, feval = NULL, verbose = 1, + obj = NULL, feval = NULL, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) { dtrain <- data @@ -174,11 +175,13 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) + printEveryN=max( as.integer(printEveryN), 1L) for (i in 1:nrounds) { succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) - cat(paste(msg, "\n", sep="")) + if (0== ( (i-1) %% printEveryN)) + cat(paste(msg, "\n", sep="")) if (!is.null(early_stop_round)) { score = strsplit(msg,':|\\s+')[[1]][3] diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 4e4fbaa2c..f4aa5f142 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -28,6 +28,7 @@ #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both #' performance and construction progress information +#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values. #' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. @@ -57,7 +58,7 @@ #' @export #' xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, - verbose = 1, early_stop_round = NULL, early.stop.round = NULL, + verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) 
{ if (is.null(missing)) { dtrain <- xgb.get.DMatrix(data, label) @@ -73,7 +74,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), watchlist <- list() } - bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, + bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, printEveryN=printEveryN, early_stop_round = early_stop_round, early.stop.round = early.stop.round) diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index a3317f1ab..74dca1c49 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -5,7 +5,7 @@ \title{eXtreme Gradient Boosting Training} \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, - feval = NULL, verbose = 1, early_stop_round = NULL, + feval = NULL, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) } \arguments{ @@ -78,6 +78,8 @@ prediction and dtrain,} \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print information of performance. If 2, xgboost will print information of both} +\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} + \item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 01c519e2e..bf9cb5c91 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -5,7 +5,7 @@ \title{eXtreme Gradient Boosting (Tree) library} \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, early_stop_round = NULL, early.stop.round = NULL, + nrounds, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) } \arguments{ @@ -42,6 +42,8 @@ Commonly used ones are: information of performance. If 2, xgboost will print information of both performance and construction progress information} +\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} + \item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. 
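
The gating behind printEveryN is plain modular arithmetic on the 1-based round counter; restated as a sketch, with n and msgs standing in for printEveryN and the per-round messages:

    n = max(int(n), 1)                 # mirrors printEveryN = max(as.integer(printEveryN), 1L)
    for i, msg in enumerate(msgs, start=1):
        if (i - 1) % n == 0:           # rounds 1, 1+n, 1+2n, ... are printed
            print(msg)
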
If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} From 60d307c44513a803069bfa705eebed3f28c7b972 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 11 May 2015 15:21:54 -0700 Subject: [PATCH 24/60] add poisson demo --- R-package/demo/00Index | 1 + R-package/demo/poisson_regression.R | 7 +++++++ R-package/demo/runall.R | 2 ++ 3 files changed, 10 insertions(+) create mode 100644 R-package/demo/poisson_regression.R diff --git a/R-package/demo/00Index b/R-package/demo/00Index index f0b41ec2a..43df3ae61 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -7,3 +7,4 @@ cross_validation Cross validation create_sparse_matrix Create Sparse Matrix predict_leaf_indices Predicting the corresponding leaves early_Stopping Early Stop in training +poisson_regression Poisson Regression on count data diff --git a/R-package/demo/poisson_regression.R b/R-package/demo/poisson_regression.R new file mode 100644 index 000000000..f9dc4ac62 --- /dev/null +++ b/R-package/demo/poisson_regression.R @@ -0,0 +1,7 @@ +data(mtcars) +head(mtcars) +bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], + objective='count:poisson',nrounds=5) +pred = predict(bst,as.matrix(mtcars[,-11])) +sqrt(mean((pred-mtcars[,11])^2)) + diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R index 2d0384156..2ea4c446e 100644 --- a/R-package/demo/runall.R +++ b/R-package/demo/runall.R @@ -7,3 +7,5 @@ demo(generalized_linear_model) demo(cross_validation) demo(create_sparse_matrix) demo(predict_leaf_indices) +demo(early_Stopping) +demo(poisson_regression) From 83ace55f51d455ee4c2693cc8bf9090c2fd38c8a Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 11 May 2015 16:03:40 -0700 Subject: [PATCH 25/60] add early stopping to xgb.cv --- R-package/R/xgb.cv.R | 59 +++++++++++++++++++++++++++++++-- R-package/demo/early_Stopping.R | 2 ++ R-package/man/xgb.cv.Rd | 12 ++++++- 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index e5f5c7b72..2dee9d9f8 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -54,6 +54,13 @@ #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). #' If folds are supplied, the nfold and stratified parameters would be ignored. #' @param verbose \code{boolean}, print the statistics during the process +#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. +#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param early.stop.round An alternative of \code{early_stop_round}. +#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. +#' #' @param ... other parameters to pass to \code{params}. #' #' @return @@ -86,7 +93,8 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, + early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) 
{ if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -109,7 +117,36 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (mc in metrics) { params <- append(params, list("eval_metric"=mc)) } - + + # Early Stopping + if (is.null(early_stop_round) && !is.null(early.stop.round)) + early_stop_round = early.stop.round + if (!is.null(early_stop_round)){ + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } + } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(metrics)>1) + warning('Only the first metric is used for early stopping process.') + } + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) obj_type = params[['objective']] mat_pred = FALSE @@ -149,6 +186,24 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) if(verbose) paste(ret, "\n", sep="") %>% cat + + # early_Stopping + if (!is.null(early_stop_round)){ + score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1] + score = strsplit(score,'\\+|:')[[1]][[2]] + score = as.numeric(score) + if ((maximize && score>bestScore) || (!maximize && scoreearly_stop_round) { + earlyStopflag = TRUE + cat('Stopping. Best iteration:',bestInd) + break + } + } + } + } colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") diff --git a/R-package/demo/early_Stopping.R b/R-package/demo/early_Stopping.R index 4cab385ca..692b413aa 100644 --- a/R-package/demo/early_Stopping.R +++ b/R-package/demo/early_Stopping.R @@ -35,3 +35,5 @@ print ('start training with early Stopping setting') # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE, earlyStopRound = 3) +bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror, + maximize = FALSE, earlyStopRound = 3) diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 19ab788f9..a2cd68c92 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -7,7 +7,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, - verbose = T, ...) + verbose = T, early_stop_round = NULL, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -65,6 +66,15 @@ If folds are supplied, the nfold and stratified parameters would be ignored.} \item{verbose}{\code{boolean}, print the statistics during the process} +\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. 
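
When maximize is omitted, both xgb.train and xgb.cv fall back on the metric whitelist above, which is also why a custom feval plus early stopping demands an explicit maximize: the wrapper cannot classify a metric it has never seen. The rule as a sketch:

    MINIMIZED = {'rmse', 'logloss', 'error', 'merror', 'mlogloss'}

    def infer_maximize(eval_metric):
        # smaller-is-better metrics give maximize=False; anything else (e.g. auc) True
        return eval_metric not in MINIMIZED
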
+If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{early.stop.round}{An alternative of \code{early_stop_round}.} + +\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. + \code{maximize=TRUE} means the larger the evaluation score the better.} + \item{...}{other parameters to pass to \code{params}.} } \value{ From 90096e718ca66453befe159f2c2d7212e07d6e74 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 11 May 2015 16:53:51 -0700 Subject: [PATCH 26/60] fix early stopping --- R-package/R/xgb.cv.R | 2 +- R-package/R/xgb.train.R | 2 +- R-package/demo/early_Stopping.R | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 2dee9d9f8..cc0e42a93 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -196,7 +196,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = bestScore = score bestInd = i } else { - if (i-bestInd>early_stop_round) { + if (i-bestInd>=early_stop_round) { earlyStopflag = TRUE cat('Stopping. Best iteration:',bestInd) break diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index f6e60e23d..8ed0099a2 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -187,7 +187,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bestScore = score bestInd = i } else { - if (i-bestInd>early_stop_round) { + if (i-bestInd>=early_stop_round) { earlyStopflag = TRUE cat('Stopping. Best iteration:',bestInd) break diff --git a/R-package/demo/early_Stopping.R b/R-package/demo/early_Stopping.R index 692b413aa..34dfebc0b 100644 --- a/R-package/demo/early_Stopping.R +++ b/R-package/demo/early_Stopping.R @@ -34,6 +34,6 @@ print ('start training with early Stopping setting') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE, - earlyStopRound = 3) + early.stop.round = 3) bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror, - maximize = FALSE, earlyStopRound = 3) + maximize = FALSE, early.stop.round = 3) From cfdd6029a8a01883ab940f3ca0cbf790d97060ee Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 11 May 2015 16:59:18 -0700 Subject: [PATCH 27/60] rename demo of early stopping --- R-package/demo/00Index | 2 +- R-package/demo/early_stopping.R | 39 +++++++++++++++++++++++++++++++++ R-package/demo/runall.R | 2 +- 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 R-package/demo/early_stopping.R diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 43df3ae61..0112eb9e1 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -6,5 +6,5 @@ generalized_linear_model Generalized Linear Model cross_validation Cross validation create_sparse_matrix Create Sparse Matrix predict_leaf_indices Predicting the corresponding leaves -early_Stopping Early Stop in training +early_stopping Early Stop in training poisson_regression Poisson Regression on count data diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R new file mode 100644 index 000000000..34dfebc0b --- /dev/null +++ b/R-package/demo/early_stopping.R @@ -0,0 +1,39 @@ +require(xgboost) +# load in the agaricus dataset +data(agaricus.train, package='xgboost') +data(agaricus.test, 
package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+# note: for customized objective function, we leave objective as default
+# note: what we are getting is margin value in prediction
+# you must know what you are doing
+param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
+watchlist <- list(eval = dtest)
+num_round <- 20
+# user define objective function, given prediction, return gradient and second order gradient
+# this is loglikelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+# user defined evaluation function, return a pair metric_name, result
+# NOTE: when you do customized loss function, the default prediction value is margin
+# this may make buildin evalution metric not function properly
+# for example, we are doing logistic loss, the prediction is score before logistic transformation
+# the buildin evaluation error assumes input is after logistic transformation
+# Take this in mind when you use the customization, and maybe you need write customized evaluation function
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  return(list(metric = "error", value = err))
+}
+print ('start training with early Stopping setting')
+# training with customized objective, we can also do step by step training
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE,
+                 early.stop.round = 3)
+bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror,
+              maximize = FALSE, early.stop.round = 3)
diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R
index 2d0384156..7311ec95e 100644
--- a/R-package/demo/runall.R
+++ b/R-package/demo/runall.R
@@ -7,3 +7,5 @@ demo(generalized_linear_model)
 demo(cross_validation)
 demo(create_sparse_matrix)
 demo(predict_leaf_indices)
-demo(early_Stopping)
+demo(early_stopping)
 demo(poisson_regression)
From c05cc48dfaefc2fc37bbac8d09863c4c0baae3d5 Mon Sep 17 00:00:00 2001
From: hetong
Date: Mon, 11 May 2015 20:55:09 -0700
Subject: [PATCH 28/60] delete redundant file

---
 R-package/demo/early_Stopping.R | 39 ---------------------------------
 1 file changed, 39 deletions(-)
 delete mode 100644 R-package/demo/early_Stopping.R

diff --git a/R-package/demo/early_Stopping.R b/R-package/demo/early_Stopping.R
deleted file mode 100644
index 34dfebc0b..000000000
--- a/R-package/demo/early_Stopping.R
+++ /dev/null
@@ -1,39 +0,0 @@
-require(xgboost)
-# load in the agaricus dataset
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-# note: for customized objective function, we leave objective as default
-# note: what we are getting is margin value in prediction
-# you must know what you are doing
-param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
-watchlist <- list(eval = dtest)
-num_round <- 20
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-logregobj <- function(preds, dtrain) {
-  labels <- getinfo(dtrain, "label")
-  preds <- 1/(1 +
exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} -# user defined evaluation function, return a pair metric_name, result -# NOTE: when you do customized loss function, the default prediction value is margin -# this may make buildin evalution metric not function properly -# for example, we are doing logistic loss, the prediction is score before logistic transformation -# the buildin evaluation error assumes input is after logistic transformation -# Take this in mind when you use the customization, and maybe you need write customized evaluation function -evalerror <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0)))/length(labels) - return(list(metric = "error", value = err)) -} -print ('start training with early Stopping setting') -# training with customized objective, we can also do step by step training -# simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE, - early.stop.round = 3) -bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror, - maximize = FALSE, early.stop.round = 3) From 755eab89495ac3803b93806e618de5918f4070e8 Mon Sep 17 00:00:00 2001 From: hetong Date: Mon, 11 May 2015 20:58:41 -0700 Subject: [PATCH 29/60] update date --- R-package/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 0a183cbe7..c6975af5e 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -2,7 +2,7 @@ Package: xgboost Type: Package Title: eXtreme Gradient Boosting Version: 0.4-0 -Date: 2014-12-28 +Date: 2015-05-11 Author: Tianqi Chen , Tong He , Michael Benesty Maintainer: Tong He Description: Xgboost is short for eXtreme Gradient Boosting, which is an From 42bf52f46289300aa15595631f41d049ccca24cb Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 11 May 2015 23:42:49 -0700 Subject: [PATCH 30/60] 0.4 --- CHANGES.md | 2 +- README.md | 18 ++++-------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 6a4fabe9f..90fd77ebb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,7 +21,7 @@ xgboost-0.3 * Add [Code Guide](src/README.md) for customizing objective function and evaluation * Add R module -in progress 0.4 +xgboost-0.4 ===== * Distributed version of xgboost that runs on YARN, scales to billions of examples * Direct save/load data and model from/to S3 and HDFS diff --git a/README.md b/README.md index 333bd4fa7..d6517c354 100644 --- a/README.md +++ b/README.md @@ -22,18 +22,10 @@ Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links) What's New ========== +* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md) * XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing) - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links) * [External Memory Version](doc/external_memory.md) -* XGBoost now support HDFS and S3 -* [Distributed XGBoost now runs on YARN](https://github.com/dmlc/wormhole/tree/master/learn/xgboost) -* [xgboost user group](https://groups.google.com/forum/#!forum/xgboost-user/) for tracking changes, sharing your experience on xgboost -* New features in the lastest changes :) - - Distributed version that scale xgboost to even larger problems with cluster - - Feature 
importance visualization in R module, thanks to Michael Benesty
-  - Predict leaf index, see [demo/guide-python/predict_leaf_indices.py](demo/guide-python/predict_leaf_indices.py)
-* XGBoost wins [Tradeshift Text Classification](https://kaggle2.blob.core.windows.net/forum-message-attachments/60041/1813/TradeshiftTextClassification.pdf?sv=2012-02-12&se=2015-01-02T13%3A55%3A16Z&sr=b&sp=r&sig=5MHvyjCLESLexYcvbSRFumGQXCS7MVmfdBIY3y01tMk%3D)
-* XGBoost wins [HEP meets ML Award in Higgs Boson Challenge](http://atlas.ch/news/2014/machine-learning-wins-the-higgs-challenge.html)
 
 Features
 ========
@@ -55,11 +47,9 @@ Build
 
 Version
 =======
-* This version xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexibility
-* This version of xgboost is not compatible with 0.2x, due to huge amount of changes in code structure
-  - This means the model and buffer file of previous version can not be loaded in xgboost-3.0
-* For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)
-* Change log in [CHANGES.md](CHANGES.md)
+* Current version xgboost-0.4, a lot of improvement has been made since 0.3
+  - Change log in [CHANGES.md](CHANGES.md)
+  - This version is compatible with 0.3x versions
 
 XGBoost in Graphlab Create
 ==========================

From cb4d7f821fcff2c13c1f6b0221f071ed3ae4e427 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 11 May 2015 23:44:02 -0700
Subject: [PATCH 31/60] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d6517c354..6082330ec 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
 
 What's New
 ==========
-* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md)
+* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
 * XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
   - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
 * [External Memory Version](doc/external_memory.md)

From 62801f53431cbedc3d747e5ce867fcd25cc07c69 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Tue, 12 May 2015 20:20:30 -0700
Subject: [PATCH 32/60] allow fpic

---
 Makefile | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index e426b797d..140537879 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ export CC = gcc
 export CXX = g++
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
 
 ifeq ($(OS), Windows_NT)
 	export CXX = g++ -m64
@@ -18,7 +18,6 @@ endif
 # by default use c++11
 ifeq ($(cxx11),1)
 	CFLAGS += -std=c++11
-else
 endif
 
 # handling dmlc
@@ -38,6 +37,14 @@ else
 	LIBDMLC=dmlc_simple.o
 endif
 
+ifndef WITH_FPIC
+	WITH_FPIC = 1
+endif
+ifeq ($(WITH_FPIC), 1)
+	CFLAGS += -fPIC
+endif
+
+
 ifeq ($(OS), Windows_NT)
 	LIBRABIT = subtree/rabit/lib/librabit_empty.a
 	SLIB = wrapper/xgboost_wrapper.dll
@@ -51,11 +58,15 @@ BIN = xgboost
 MOCKBIN = xgboost.mock
 OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
 MPIBIN =
-TARGET = $(BIN) $(OBJ) $(SLIB)
+ifeq ($(WITH_FPIC), 1)
+	TARGET = $(BIN) $(OBJ) $(SLIB)
+else
+	TARGET = $(BIN)
+endif
 
 .PHONY: clean all mpi python Rpack
 
-all: $(BIN) $(OBJ) $(SLIB)
+all: $(TARGET)
 mpi: $(MPIBIN)
 python: wrapper/libxgboostwrapper.so
@@ -79,7 +90,7 @@ 
subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+	cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
 
 $(BIN) :
-	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+	$(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
 
 $(MOCKBIN) :
 	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

From a4341f22a2b45a59bf03df2b2efae67c8c4195f6 Mon Sep 17 00:00:00 2001
From: by321
Date: Wed, 13 May 2015 21:51:05 -0700
Subject: [PATCH 33/60] xgb.cv(printEveryN) parameter to print every n-th progress message

---
 R-package/R/xgb.cv.R    | 8 ++++++--
 R-package/man/xgb.cv.Rd | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index cc0e42a93..c290fba8b 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -54,6 +54,7 @@
 #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
 #' If folds are supplied, the nfold and stratified parameters would be ignored.
 #' @param verbose \code{boolean}, print the statistics during the process
+#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
 #' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
 #' If set to an integer \code{k}, training with a validation set will stop if the performance
 #' keeps getting worse consecutively for \code{k} rounds.
@@ -93,7 +94,7 @@
 #'
 xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
                    prediction = FALSE, showsd = TRUE, metrics=list(),
-                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,
+                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, printEveryN=1L,
                    early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) {
   if (typeof(params) != "list") {
     stop("xgb.cv: first argument params must be list")
@@ -161,6 +162,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
   else predictValues <- rep(0,xgb.numrow(dtrain))
   history <- c()
+  printEveryN = max(as.integer(printEveryN), 1L)
   for (i in 1:nrounds) {
     msg <- list()
     for (k in 1:nfold) {
@@ -185,7 +187,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     }
     ret <- xgb.cv.aggcv(msg, showsd)
     history <- c(history, ret)
-    if(verbose) paste(ret, "\n", sep="") %>% cat
+    if(verbose)
+      if (0==(i-1L)%%printEveryN)
+        cat(ret, "\n", sep="")
 
     # early_Stopping
     if (!is.null(early_stop_round)){
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index a2cd68c92..389cadef0 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -7,7 +7,7 @@
 xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
   missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
-  verbose = T, early_stop_round = NULL, early.stop.round = NULL,
+  verbose = T, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
   maximize = NULL, ...)
}
\arguments{
@@ -66,6 +66,8 @@ If folds are supplied, the nfold and stratified parameters would be ignored.}
 
\item{verbose}{\code{boolean}, print the statistics during the process}
 
+\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+
\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. 
If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} From 49ad6335307f2c525160459be85cb7bcfbea1f67 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 13 May 2015 23:15:19 -0700 Subject: [PATCH 34/60] Update xgboost.py --- wrapper/xgboost.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 25ff8b1b2..5b38d2f2f 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -694,11 +694,11 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) bst.best_score = best_score bst.best_iteration = best_score_i - return bst - + break + bst.best_score = best_score + bst.best_iteration = best_score_i return bst - class CVPack(object): def __init__(self, dtrain, dtest, param): self.dtrain = dtrain From 3a7808dc7de97db6a972cb4a60df68747d9e954f Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 13 May 2015 23:34:09 -0700 Subject: [PATCH 35/60] remove print --- src/utils/quantile.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/utils/quantile.h b/src/utils/quantile.h index 2c0f7f000..4e885e254 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -328,14 +328,14 @@ struct WXQSummary : public WQSummary { } if (nbig >= n - 1) { // see what was the case - fprintf(stderr, "LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); - fprintf(stderr, "LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", - src.size, maxsize, static_cast(range), - static_cast(chunk)); + utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); + utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", + src.size, maxsize, static_cast(range), + static_cast(chunk)); for (size_t i = 0; i < src.size; ++i) { - printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, - src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, - src.data[i].value, CheckLarge(src.data[i], chunk)); + utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, + src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, + src.data[i].value, CheckLarge(src.data[i], chunk)); } utils::Assert(nbig < n - 1, "quantile: too many large chunk"); } From e080c663a810f6a847d548d797a1be6880f3cf72 Mon Sep 17 00:00:00 2001 From: Alex Chao Date: Thu, 14 May 2015 11:57:50 -0700 Subject: [PATCH 36/60] Updated grammar for the README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6082330ec..7d3702666 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ XGBoost: eXtreme Gradient Boosting ================================== An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. -It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also also distributed and scale to Terascale data +It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). 
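As an aside on the xgboost.py fix above: train now breaks out of the boosting loop instead of returning from inside it, so best_score and best_iteration are attached to the booster whether or not training stopped early. A minimal sketch of the caller side (the early_stopping_rounds keyword and the data paths are assumptions for illustration, not taken from this patch):

import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
# a non-empty evals list is required so early stopping has something to monitor
bst = xgb.train(param, dtrain, num_boost_round=100,
                evals=[(dtest, 'eval')], early_stopping_rounds=3)
# with the fix, these attributes are set in both the early-stop and full-run cases
print(bst.best_score, bst.best_iteration)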
 XGBoost can also be distributed and scale to Terascale data
 
 Contributors: https://github.com/dmlc/xgboost/graphs/contributors

From b63868327f90c26353bdd293c9183a303c684436 Mon Sep 17 00:00:00 2001
From: Eugene Nizhibitsky
Date: Thu, 14 May 2015 22:55:49 +0300
Subject: [PATCH 37/60] Fix early stopping in python wrapper

---
 wrapper/xgboost.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 5b38d2f2f..7e3fa2dc4 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -657,7 +657,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
     maximize_score = False
     if 'eval_metric' in params:
         maximize_metrics = ('auc', 'map', 'ndcg')
-        if filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics):
+        if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)):
             maximize_score = True
 
     if maximize_score:

From 4e080928a8b6d62e57f5a08e7637790f26b76ace Mon Sep 17 00:00:00 2001
From: John Wittenauer
Date: Fri, 15 May 2015 21:19:39 -0400
Subject: [PATCH 38/60] Added classes_ attribute to scikit-learn wrapper.

---
 wrapper/xgboost.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 7e3fa2dc4..46a229cd6 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -935,8 +935,8 @@ class XGBClassifier(XGBModel, XGBClassifier):
                                             base_score, seed)
 
     def fit(self, X, y, sample_weight=None):
-        y_values = list(np.unique(y))
-        self.n_classes_ = len(y_values)
+        self.classes_ = list(np.unique(y))
+        self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
             # Switch to using a multiclass objective in the underlying XGB instance
             self.objective = "multi:softprob"

From 792cff5abc325ee841607ab41a04300e1040ec55 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 15 May 2015 23:54:03 -0700
Subject: [PATCH 39/60] checkin some micro optimization

---
 Makefile                           |   2 +-
 src/tree/param.h                   |  18 ++++-
 src/tree/updater_colmaker-inl.hpp  | 103 ++++++++++++++++++++++++++++-
 src/tree/updater_histmaker-inl.hpp |  84 ++++++++++++++++++++---
 4 files changed, 193 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 140537879..e568222c2 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ export CC = gcc
 export CXX = g++
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
 
 ifeq ($(OS), Windows_NT)
 	export CXX = g++ -m64
diff --git a/src/tree/param.h b/src/tree/param.h
index 3458a93a4..118540119 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -48,6 +48,8 @@ struct TrainParam{
   int size_leaf_vector;
   // option for parallelization
   int parallel_option;
+  // option to enable cache-line optimization
+  int cache_opt;
   // number of threads to be used for tree construction,
   // if OpenMP is enabled, if equals 0, use system default
   int nthread;
@@ -70,6 +72,7 @@ struct TrainParam{
     parallel_option = 2;
     sketch_eps = 0.1f;
     sketch_ratio = 2.0f;
+    cache_opt = 0;
   }
  /*! 
* \brief set parameters from outside @@ -96,6 +99,7 @@ struct TrainParam{ if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast(atof(val)); if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val); + if (!strcmp(name, "cache_opt")) cache_opt = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "parallel_option")) parallel_option = atoi(val); @@ -192,6 +196,11 @@ struct GradStats { double sum_grad; /*! \brief sum hessian statistics */ double sum_hess; + /*! + * \brief whether this is simply statistics and we only need to call + * Add(gpair), instead of Add(gpair, info, ridx) + */ + static const int kSimpleStats = 1; /*! \brief constructor, the object must be cleared during construction */ explicit GradStats(const TrainParam ¶m) { this->Clear(); @@ -204,7 +213,14 @@ struct GradStats { inline static void CheckInfo(const BoosterInfo &info) { } /*! - * \brief accumulate statistics, + * \brief accumulate statistics + * \param p the gradient pair + */ + inline void Add(bst_gpair p) { + this->Add(p.grad, p.hess); + } + /*! + * \brief accumulate statistics, more complicated version * \param gpair the vector storing the gradient statistics * \param info the additional information * \param ridx instance index of this instance diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index f657c0335..b52842a93 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -356,7 +356,100 @@ class ColMaker: public IUpdater { } } } - } + } + // update enumeration solution + inline void UpdateEnumeration(int nid, bst_gpair gstats, + float fvalue, int d_step, bst_uint fid, + TStats &c, std::vector &temp) { + // get the statistics of nid + ThreadEntry &e = temp[nid]; + // test if first hit, this is fine, because we set 0 during init + if (e.stats.Empty()) { + e.stats.Add(gstats); + e.last_fvalue = fvalue; + } else { + // try to find a split + if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { + c.SetSubstract(snode[nid].stats, e.stats); + if (c.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1); + } + } + // update the statistics + e.stats.Add(gstats); + e.last_fvalue = fvalue; + } + } + // same as EnumerateSplit, with cacheline prefetch optimization + inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin, + const ColBatch::Entry *end, + int d_step, + bst_uint fid, + const std::vector &gpair, + std::vector &temp) { + const std::vector &qexpand = qexpand_; + // clear all the temp statistics + for (size_t j = 0; j < qexpand.size(); ++j) { + temp[qexpand[j]].stats.Clear(); + } + // left statistics + TStats c(param); + // local cache buffer for position and gradient pair + const int kBuffer = 32; + int buf_position[kBuffer]; + bst_gpair buf_gpair[kBuffer]; + // aligned ending position + const ColBatch::Entry *align_end; + if (d_step > 0) { + align_end = begin + (end - begin) / kBuffer * kBuffer; + } else { + align_end = begin - (begin - end) / kBuffer * kBuffer; + } + int i; + const ColBatch::Entry *it; + const int align_step = d_step * kBuffer; + // internal cached loop + for (it = begin; it != align_end; it += align_step) { + const ColBatch::Entry *p; + for (i = 
0, p = it; i < kBuffer; ++i, p += d_step) { + buf_position[i] = position[p->index]; + buf_gpair[i] = gpair[p->index]; + } + for (i = 0, p = it; i < kBuffer; ++i, p += d_step) { + const int nid = buf_position[i]; + if (nid < 0) continue; + this->UpdateEnumeration(nid, buf_gpair[i], + p->fvalue, d_step, + fid, c, temp); + } + } + // finish up the ending piece + for (it = align_end, i = 0; it != end; ++i, it += d_step) { + buf_position[i] = position[it->index]; + buf_gpair[i] = gpair[it->index]; + } + for (it = align_end, i = 0; it != end; ++i, it += d_step) { + const int nid = buf_position[i]; + if (nid < 0) continue; + this->UpdateEnumeration(nid, buf_gpair[i], + it->fvalue, d_step, + fid, c, temp); + } + // finish updating all statistics, check if it is possible to include all sum statistics + for (size_t i = 0; i < qexpand.size(); ++i) { + const int nid = qexpand[i]; + ThreadEntry &e = temp[nid]; + c.SetSubstract(snode[nid].stats, e.stats); + if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + const float gap = std::abs(e.last_fvalue) + rt_eps; + const float delta = d_step == +1 ? gap: -gap; + e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1); + } + } + } + // enumerate the split values of specific feature inline void EnumerateSplit(const ColBatch::Entry *begin, const ColBatch::Entry *end, @@ -365,6 +458,11 @@ class ColMaker: public IUpdater { const std::vector &gpair, const BoosterInfo &info, std::vector &temp) { + // use cacheline aware optimization + if (TStats::kSimpleStats != 0 && param.cache_opt != 0) { + EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp); + return; + } const std::vector &qexpand = qexpand_; // clear all the temp statistics for (size_t j = 0; j < qexpand.size(); ++j) { @@ -411,6 +509,7 @@ class ColMaker: public IUpdater { } } } + // update the solution candidate virtual void UpdateSolution(const ColBatch &batch, const std::vector &gpair, @@ -550,8 +649,8 @@ class ColMaker: public IUpdater { #pragma omp parallel for schedule(static) for (bst_omp_uint j = 0; j < ndata; ++j) { const bst_uint ridx = col[j].index; - const float fvalue = col[j].fvalue; const int nid = this->DecodePosition(ridx); + const float fvalue = col[j].fvalue; // go back to parent, correct those who are not default if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { if(fvalue < tree[nid].split_cond()) { diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp index d6279592f..f739f23f3 100644 --- a/src/tree/updater_histmaker-inl.hpp +++ b/src/tree/updater_histmaker-inl.hpp @@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker { utils::Assert(istart != hist.size, "the bound variable must be max"); hist.data[istart].Add(gpair, info, ridx); } + /*! 
+ * \brief add a histogram to data, + * do linear scan, start from istart + */ + inline void Add(bst_float fv, + bst_gpair gstats) { + while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; + utils::Assert(istart != hist.size, "the bound variable must be max"); + hist.data[istart].Add(gstats); + } }; // sketch type used for this typedef utils::WXQuantileSketch WXQSketch; @@ -479,11 +489,38 @@ class CQHistMaker: public HistMaker { hbuilder[nid].istart = 0; hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)]; } - for (bst_uint j = 0; j < c.length; ++j) { - const bst_uint ridx = c[j].index; - const int nid = this->position[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx); + if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) { + const bst_uint kBuffer = 32; + bst_uint align_length = c.length / kBuffer * kBuffer; + int buf_position[kBuffer]; + bst_gpair buf_gpair[kBuffer]; + for (bst_uint j = 0; j < align_length; j += kBuffer) { + for (bst_uint i = 0; i < kBuffer; ++i) { + bst_uint ridx = c[j + i].index; + buf_position[i] = this->position[ridx]; + buf_gpair[i] = gpair[ridx]; + } + for (bst_uint i = 0; i < kBuffer; ++i) { + const int nid = buf_position[i]; + if (nid >= 0) { + hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]); + } + } + } + for (bst_uint j = align_length; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + hbuilder[nid].Add(c[j].fvalue, gpair[ridx]); + } + } + } else { + for (bst_uint j = 0; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx); + } } } } @@ -536,11 +573,38 @@ class CQHistMaker: public HistMaker { sbuilder[nid].Init(max_size); } // second pass, build the sketch - for (bst_uint j = 0; j < c.length; ++j) { - const bst_uint ridx = c[j].index; - const int nid = this->position[ridx]; - if (nid >= 0) { - sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) { + const bst_uint kBuffer = 32; + bst_uint align_length = c.length / kBuffer * kBuffer; + int buf_position[kBuffer]; + bst_float buf_hess[kBuffer]; + for (bst_uint j = 0; j < align_length; j += kBuffer) { + for (bst_uint i = 0; i < kBuffer; ++i) { + bst_uint ridx = c[j + i].index; + buf_position[i] = this->position[ridx]; + buf_hess[i] = gpair[ridx].hess; + } + for (bst_uint i = 0; i < kBuffer; ++i) { + const int nid = buf_position[i]; + if (nid >= 0) { + sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size); + } + } + } + for (bst_uint j = align_length; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + } + } + } else { + for (bst_uint j = 0; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + } } } for (size_t i = 0; i < this->qexpand.size(); ++i) { From 09a841f810f19593059bf7a658c40fb2c3743bd0 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 15 May 2015 23:54:34 -0700 Subject: [PATCH 40/60] auto turn on optimization --- src/tree/param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/param.h b/src/tree/param.h index 118540119..1bffcb32c 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -72,7 +72,7 @@ struct 
TrainParam{
     parallel_option = 2;
     sketch_eps = 0.1f;
     sketch_ratio = 2.0f;
-    cache_opt = 0;
+    cache_opt = 1;
   }
   /*!
    * \brief set parameters from outside

From e6b8b23a2c84d38fbef753aea11e8a2d3b6d7260 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 16 May 2015 12:59:55 -0700
Subject: [PATCH 41/60] allow booster to be picklable, add copy function

---
 demo/guide-python/basic_walkthrough.py | 27 ++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/demo/guide-python/basic_walkthrough.py b/demo/guide-python/basic_walkthrough.py
index ba8a4319f..cdff65c33 100755
--- a/demo/guide-python/basic_walkthrough.py
+++ b/demo/guide-python/basic_walkthrough.py
@@ -1,7 +1,9 @@
 #!/usr/bin/python
 import numpy as np
 import scipy.sparse
+import pickle
 import xgboost as xgb
+import copy
 
 ### simple example
 # load file from text file, also binary buffer generated by xgboost
@@ -19,7 +21,7 @@ bst = xgb.train(param, dtrain, num_round, watchlist)
 # this is prediction
 preds = bst.predict(dtest)
 labels = dtest.get_label()
-print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
+print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
 bst.save_model('0001.model')
 # dump model
 bst.dump_model('dump.raw.txt')
@@ -28,6 +30,7 @@ bst.dump_model('dump.nice.txt','../data/featmap.txt')
 
 # save dmatrix into binary buffer
 dtest.save_binary('dtest.buffer')
+# save model
 bst.save_model('xgb.model')
 # load model and data in
 bst2 = xgb.Booster(model_file='xgb.model')
@@ -36,6 +39,14 @@ preds2 = bst2.predict(dtest2)
 # assert they are the same
 assert np.sum(np.abs(preds2-preds)) == 0
 
+# alternatively, you can pickle the booster
+pks = pickle.dumps(bst2)
+# load model and data in
+bst3 = pickle.loads(pks)
+preds3 = bst3.predict(dtest2)
+# assert they are the same
+assert np.sum(np.abs(preds3-preds)) == 0
+
 ###
 # build dmatrix from scipy.sparse
 print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
@@ -44,22 +55,22 @@ row = []; col = []; dat = []
 i = 0
 for l in open('../data/agaricus.txt.train'):
     arr = l.split()
-    labels.append( int(arr[0]))
+    labels.append(int(arr[0]))
     for it in arr[1:]:
         k,v = it.split(':')
         row.append(i); col.append(int(k)); dat.append(float(v))
     i += 1
-csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
-dtrain = xgb.DMatrix( csr, label = labels )
+csr = scipy.sparse.csr_matrix((dat, (row,col)))
+dtrain = xgb.DMatrix(csr, label = labels)
 watchlist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, watchlist )
+bst = xgb.train(param, dtrain, num_round, watchlist)
 
 print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
 # we can also construct from csc matrix
-csc = scipy.sparse.csc_matrix( (dat, (row,col)) )
+csc = scipy.sparse.csc_matrix((dat, (row,col)))
 dtrain = xgb.DMatrix(csc, label=labels)
 watchlist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, watchlist )
+bst = xgb.train(param, dtrain, num_round, watchlist)
 
 print ('start running example of build DMatrix from numpy array')
 # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation
 npymat = csr.todense()
 dtrain = xgb.DMatrix(npymat, label = labels)
 watchlist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, watchlist )
+bst = xgb.train(param, dtrain, num_round, 
watchlist) From 91a539092909e3ad005b36713573fb8fee2424aa Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 17 May 2015 21:29:51 -0700 Subject: [PATCH 42/60] checkin copy --- demo/guide-python/basic_walkthrough.py | 1 - wrapper/xgboost.py | 61 +++++++++++++++++++++----- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/demo/guide-python/basic_walkthrough.py b/demo/guide-python/basic_walkthrough.py index cdff65c33..5bfa55935 100755 --- a/demo/guide-python/basic_walkthrough.py +++ b/demo/guide-python/basic_walkthrough.py @@ -3,7 +3,6 @@ import numpy as np import scipy.sparse import pickle import xgboost as xgb -import copy ### simple example # load file from text file, also binary buffer generated by xgboost diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 46a229cd6..8d9c82b80 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -127,7 +127,6 @@ class DMatrix(object): weight : list or numpy 1-D array (optional) Weight for each instance. """ - # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None @@ -348,6 +347,46 @@ class Booster(object): def __del__(self): xglib.XGBoosterFree(self.handle) + def __getstate__(self): + # can't pickle ctypes pointers + # put model content in bytearray + this = self.__dict__.copy() + handle = this['handle'] + if handle is not None: + raw = self.save_raw() + this["handle"] = raw + return this + + def __setstate__(self, state): + # reconstruct handle from raw data + handle = state['handle'] + if handle is not None: + buf = handle + dmats = c_array(ctypes.c_void_p, []) + handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0)) + length = ctypes.c_ulong(len(buf)) + ptr = (ctypes.c_char * len(buf)).from_buffer(buf) + xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length) + state['handle'] = handle + self.__dict__.update(state) + self.set_param({'seed': 0}) + + def __copy__(self): + return self.__deepcopy__() + + def __deepcopy__(self): + return Booster(model_file = self.save_raw()) + + def copy(self): + """ + Copy the booster object + + Returns + -------- + a copied booster model + """ + return self.__copy__() + def set_param(self, params, pv=None): if isinstance(params, collections.Mapping): params = params.items() @@ -440,6 +479,11 @@ class Booster(object): """ Predict with data. + NOTE: This function is not thread safe. + For each booster object, predict can only be called from one thread. 
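# A rough sketch of what the pickle/copy support added above enables
# (the model file name is illustrative; assumes a booster was saved earlier with save_model):
import pickle
import xgboost as xgb
bst = xgb.Booster(model_file='xgb.model')
bst2 = pickle.loads(pickle.dumps(bst))  # round-trips via __getstate__/__setstate__
bst3 = bst.copy()                       # independent handle, safe for predict in another thread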
+        If you want to run prediction using multiple threads, call bst.copy() to make copies
+        of the model object and then call predict
+
         Parameters
         ----------
         data : DMatrix
@@ -874,18 +918,12 @@ class XGBModel(XGBModelBase):
 
         self._Booster = None
 
-    def __getstate__(self):
-        # can't pickle ctypes pointers so put _Booster in a bytearray object
-        this = self.__dict__.copy()  # don't modify in place
-        bst = this["_Booster"]
-        if bst is not None:
-            raw = this["_Booster"].save_raw()
-            this["_Booster"] = raw
-        return this
-
     def __setstate__(self, state):
+        # backward compatibility code
+        # load booster from raw if it is raw
+        # the booster now supports pickle
         bst = state["_Booster"]
-        if bst is not None:
+        if bst is not None and not isinstance(bst, Booster):
             state["_Booster"] = Booster(model_file=bst)
         self.__dict__.update(state)

From 0a0a80ec72fc50a85563909557515e77bfa8714a Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Thu, 14 May 2015 14:08:03 -0500
Subject: [PATCH 43/60] ENH: Allow settable missing value in sklearn api.

---
 wrapper/xgboost.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 5b38d2f2f..6ab856a6e 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -93,6 +93,7 @@ def ctypes2numpy(cptr, length, dtype):
         raise RuntimeError('memmove failed')
     return res
 
+
 def ctypes2buffer(cptr, length):
     if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
         raise RuntimeError('expected char pointer')
@@ -102,6 +103,7 @@ def ctypes2buffer(cptr, length):
         raise RuntimeError('memmove failed')
     return res
 
+
 def c_str(string):
     return ctypes.c_char_p(string.encode('utf-8'))
 
@@ -850,10 +852,13 @@ class XGBModel(XGBModelBase):
         The initial prediction score of all instances, global bias.
     seed : int
         Random number seed.
+    missing : float, optional
+        Value in the data which needs to be present as a missing value. If
+        None, defaults to np.nan.
     """
     def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
                  silent=True, objective="reg:linear",
                  nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0):
+                 base_score=0.5, seed=0, missing=None):
         if not SKLEARN_INSTALLED:
             raise XGBError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
@@ -871,6 +876,7 @@ class XGBModel(XGBModelBase):
         self.base_score = base_score
         self.seed = seed
+        self.missing = missing or np.nan
 
         self._Booster = None
 
@@ -902,6 +908,12 @@ class XGBModel(XGBModelBase):
             raise XGBError('need to call fit beforehand')
         return self._Booster
 
+    def get_params(self, deep=False):
+        params = super(XGBModel, self).get_params(deep=deep)
+        if params['missing'] is np.nan:
+            params['missing'] = None  # sklearn doesn't handle nan. 
see #4725 + return params + def get_xgb_params(self): xgb_params = self.get_params() @@ -912,12 +924,12 @@ class XGBModel(XGBModelBase): return xgb_params def fit(self, X, y): - trainDmatrix = DMatrix(X, label=y) + trainDmatrix = DMatrix(X, label=y, missing=self.missing) self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators) return self def predict(self, X): - testDmatrix = DMatrix(X) + testDmatrix = DMatrix(X, missing=self.missing) return self.booster().predict(testDmatrix) @@ -928,11 +940,11 @@ class XGBClassifier(XGBModel, XGBClassifier): def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic", nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, - base_score=0.5, seed=0): + base_score=0.5, seed=0, missing=None): super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective, nthread, gamma, min_child_weight, max_delta_step, subsample, colsample_bytree, - base_score, seed) + base_score, seed, missing) def fit(self, X, y, sample_weight=None): y_values = list(np.unique(y)) @@ -949,16 +961,18 @@ class XGBClassifier(XGBModel, XGBClassifier): training_labels = self._le.transform(y) if sample_weight is not None: - trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight) + trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight, + missing=self.missing) else: - trainDmatrix = DMatrix(X, label=training_labels) + trainDmatrix = DMatrix(X, label=training_labels, + missing=self.missing) self._Booster = train(xgb_options, trainDmatrix, self.n_estimators) return self def predict(self, X): - testDmatrix = DMatrix(X) + testDmatrix = DMatrix(X, missing=self.missing) class_probs = self.booster().predict(testDmatrix) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) @@ -968,7 +982,7 @@ class XGBClassifier(XGBModel, XGBClassifier): return self._le.inverse_transform(column_indexes) def predict_proba(self, X): - testDmatrix = DMatrix(X) + testDmatrix = DMatrix(X, missing=self.missing) class_probs = self.booster().predict(testDmatrix) if self.objective == "multi:softprob": return class_probs From a17cb2339e659aef1c167f2827aeef9ca393be66 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 18 May 2015 09:09:22 -0500 Subject: [PATCH 44/60] BUG: XGBError -> XGBoostError --- wrapper/xgboost.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 8d9c82b80..c1857ee91 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -363,24 +363,24 @@ class Booster(object): if handle is not None: buf = handle dmats = c_array(ctypes.c_void_p, []) - handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0)) + handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0)) length = ctypes.c_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length) - state['handle'] = handle + state['handle'] = handle self.__dict__.update(state) self.set_param({'seed': 0}) - + def __copy__(self): return self.__deepcopy__() def __deepcopy__(self): return Booster(model_file = self.save_raw()) - + def copy(self): """ Copy the booster object - + Returns -------- a copied booster model @@ -899,7 +899,7 @@ class XGBModel(XGBModelBase): nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, base_score=0.5, seed=0): if not SKLEARN_INSTALLED: - raise XGBError('sklearn needs to be 
installed in order to use this module')
+            raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
         self.learning_rate = learning_rate
         self.n_estimators = n_estimators
@@ -937,7 +937,7 @@ class XGBModel(XGBModelBase):
         booster : a xgboost booster of underlying model
         """
         if self._Booster is None:
-            raise XGBError('need to call fit beforehand')
+            raise XGBoostError('need to call fit beforehand')
         return self._Booster

From 978216d35064df443a8a0083059f125b1785492f Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Mon, 18 May 2015 11:43:58 -0500
Subject: [PATCH 45/60] ENH: Allow missing = 0

---
 wrapper/xgboost.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index fe6840034..bb61083c4 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -920,7 +920,7 @@ class XGBModel(XGBModelBase):
         self.base_score = base_score
         self.seed = seed
-        self.missing = missing or np.nan
+        self.missing = missing if missing is not None else np.nan
 
         self._Booster = None

From b1c79323af4ed025054fdae1c278112b9dadb419 Mon Sep 17 00:00:00 2001
From: Daniel Saltiel
Date: Wed, 20 May 2015 17:13:20 -0700
Subject: [PATCH 46/60] Update parameter.md to include parameter ranges

only updated for tree booster parameters
---
 doc/parameter.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/parameter.md b/doc/parameter.md
index 6b47b4bf9..469256f62 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -26,19 +26,26 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
 
 #### Parameter for Tree Booster
 * eta [default=0.3]
-  - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative.
+  - step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta actually shrinks the feature weights to make the boosting process more conservative.
+  - range: [0,1]
 * gamma [default=0]
   - minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
+  - range: [0,∞]
 * max_depth [default=6]
   - maximum depth of a tree
+  - range: [2,∞]
 * min_child_weight [default=1]
   - minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.
+  - range: [1,∞]
 * max_delta_step [default=0]
   - Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update
+  - range: [0,∞]
 * subsample [default=1]
   - subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. 
+ - range: (0,1] * colsample_bytree [default=1] - subsample ratio of columns when constructing each tree. + - range: (0,1] #### Parameter for Linear Booster * lambda [default=0] From a31aaa410c5da0c9722d5861877643359304ec7d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 20 May 2015 17:27:15 -0700 Subject: [PATCH 47/60] Update parameter.md --- doc/parameter.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/parameter.md b/doc/parameter.md index 469256f62..13eefa0fe 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -33,10 +33,10 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame - range: [0,∞] * max_depth [default=6] - maximum depth of a tree - - range: [2,∞] + - range: [1,∞] * min_child_weight [default=1] - minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. - - range: [1,∞] + - range: [0,∞] * max_delta_step [default=0] - Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update - range: [0,∞] From 7d132aefa98f98a127c6c41ac71fc0299ddfd4ac Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 21 May 2015 13:01:15 -0700 Subject: [PATCH 48/60] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index b9f38c38a..3be067aed 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014 by Tianqi Chen and Contributors +Copyright (c) 2014 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From bc7241b2a4c1a1a05c83e3e8000871fe1f3090a5 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 21 May 2015 13:44:21 -0700 Subject: [PATCH 49/60] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7d3702666..415bf771b 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ Distributed Version: [Distributed XGBoost](multi-node) Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links) +XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects + What's New ========== * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04) From 1d57cfb7bd41cea56b4d2fdc03cb969e80581e8c Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 22 May 2015 13:27:08 -0700 Subject: [PATCH 50/60] Update xgboost.py --- wrapper/xgboost.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index bb61083c4..0280d87b3 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -113,7 +113,7 @@ def c_array(ctype, values): class DMatrix(object): - def __init__(self, data, label=None, missing=0.0, weight=None): + def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. 
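For illustration, the new silent flag added in this patch can be combined with the existing missing argument like this (a sketch; the file path is illustrative):

import numpy as np
import xgboost as xgb

# suppress the construction messages printed while loading from file
dtrain = xgb.DMatrix('../data/agaricus.txt.train', silent=True)
# declare a sentinel other than the default 0.0 as the missing value
X = np.random.rand(10, 3)
y = np.random.randint(2, size=10)
dmat = xgb.DMatrix(X, label=y, missing=-999.0)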
@@ -128,13 +128,15 @@ class DMatrix(object):
             Value in the data which needs to be present as a missing value.
         weight : list or numpy 1-D array (optional)
             Weight for each instance.
+        silent: boolean
+            Whether to print messages during construction
         """
         # force into void_p, mac need to pass things in as void_p
         if data is None:
             self.handle = None
             return
         if isinstance(data, string_types):
-            self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0))
+            self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent)))
         elif isinstance(data, scipy.sparse.csr_matrix):
             self._init_from_csr(data)
         elif isinstance(data, scipy.sparse.csc_matrix):

From 458585b5fdb61787e9e33d4bcdc97b2ab87dde97 Mon Sep 17 00:00:00 2001
From: Tong He
Date: Mon, 25 May 2015 10:24:59 -0700
Subject: [PATCH 51/60] Update xgb.train.R

---
 R-package/R/xgb.train.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 6b2ace886..99eee2b0f 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -137,7 +137,6 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
   }
   if (length(watchlist) != 0 && verbose == 0) {
     warning('watchlist is provided but verbose=0, no evaluation information will be printed')
-    watchlist <- list()
   }
 
   params = append(params, list(...))

From 19b24cf978763701b41a928679e254743c18b629 Mon Sep 17 00:00:00 2001
From: hetong007
Date: Mon, 25 May 2015 11:19:38 -0700
Subject: [PATCH 52/60] customized obj and feval interface

---
 R-package/R/xgb.cv.R    | 16 ++++++++++++++++
 R-package/R/xgb.train.R | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index c290fba8b..f2dd00e89 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -119,6 +119,22 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     params <- append(params, list("eval_metric"=mc))
   }
 
+  # customized objective and evaluation metric interface
+  if (!is.null(params$objective) && !is.null(obj))
+    stop("xgb.cv: cannot assign two different objectives")
+  if (!is.null(params$objective))
+    if (class(params$objective)=='function') {
+      obj = params$objective
+      params$objective = NULL
+    }
+  if (!is.null(params$eval_metric) && !is.null(feval))
+    stop("xgb.cv: cannot assign two different evaluation metrics")
+  if (!is.null(params$eval_metric))
+    if (class(params$eval_metric)=='function') {
+      feval = params$eval_metric
+      params$eval_metric = NULL
+    }
+
   # Early Stopping
   if (is.null(early_stop_round) && 
!is.null(early.stop.round)) early_stop_round = early.stop.round From 8d3a7e1688f075779f7ba96ec6d68cba23a75619 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 25 May 2015 11:30:04 -0700 Subject: [PATCH 53/60] change doc and demo for new obj feval interface --- R-package/R/xgb.train.R | 8 ++++---- R-package/demo/custom_objective.R | 6 ++++-- R-package/man/xgb.cv.Rd | 4 ++-- R-package/man/xgb.train.Rd | 10 +++++----- R-package/man/xgboost.Rd | 4 ++-- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 795a62eb1..099d8b4d0 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -36,7 +36,7 @@ #' 3. Task Parameters #' #' \itemize{ -#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: +#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: #' \itemize{ #' \item \code{reg:linear} linear regression (Default). #' \item \code{reg:logistic} logistic regression. @@ -48,7 +48,7 @@ #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 -#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. +#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. #' } #' #' @param data takes an \code{xgb.DMatrix} as the input. 
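The Python wrapper exposes the equivalent hooks as the plain obj and feval arguments of train (visible in its signature earlier in this series). A sketch mirroring the R demo, with an illustrative data path:

import numpy as np
import xgboost as xgb

def logregobj(preds, dtrain):
    # given raw margin predictions, return gradient and second-order gradient
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess

def evalerror(preds, dtrain):
    # return a pair (metric_name, value); preds here are raw margin scores
    labels = dtrain.get_label()
    return 'error', float(np.sum(labels != (preds > 0.0))) / len(labels)

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth': 2, 'eta': 1, 'silent': 1}
bst = xgb.train(param, dtrain, 2, [(dtrain, 'train')], logregobj, evalerror)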
@@ -103,7 +103,6 @@ #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) #' dtest <- dtrain #' watchlist <- list(eval = dtest, train = dtrain) -#' param <- list(max.depth = 2, eta = 1, silent = 1) #' logregobj <- function(preds, dtrain) { #' labels <- getinfo(dtrain, "label") #' preds <- 1/(1 + exp(-preds)) @@ -116,7 +115,8 @@ #' err <- as.numeric(sum(labels != (preds > 0)))/length(labels) #' return(list(metric = "error", value = err)) #' } -#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror) +#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror) +#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) #' @export #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R index b0a0a02ca..cb90a7b5a 100644 --- a/R-package/demo/custom_objective.R +++ b/R-package/demo/custom_objective.R @@ -8,7 +8,6 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) # note: for customized objective function, we leave objective as default # note: what we are getting is margin value in prediction # you must know what you are doing -param <- list(max.depth=2,eta=1,nthread = 2, silent=1) watchlist <- list(eval = dtest, train = dtrain) num_round <- 2 @@ -33,10 +32,13 @@ evalerror <- function(preds, dtrain) { err <- as.numeric(sum(labels != (preds > 0)))/length(labels) return(list(metric = "error", value = err)) } + +param <- list(max.depth=2,eta=1,nthread = 2, silent=1, + objective=logregobj, eval_metric=evalerror) print ('start training with user customized objective') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) +bst <- xgb.train(param, dtrain, num_round, watchlist) # # there can be cases where you want additional information diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 389cadef0..f1fe97563 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -7,8 +7,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, - verbose = T, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, - maximize = NULL, ...) + verbose = T, printEveryN = 1L, early_stop_round = NULL, + early.stop.round = NULL, maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 74dca1c49..4d5d8d3e6 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -5,7 +5,7 @@ \title{eXtreme Gradient Boosting Training} \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, - feval = NULL, verbose = 1, printEveryN=1L, early_stop_round = NULL, + feval = NULL, verbose = 1, printEveryN = 1L, early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) } \arguments{ @@ -43,7 +43,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, 3. 
Task Parameters \itemize{ -\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: +\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: \itemize{ \item \code{reg:linear} linear regression (Default). \item \code{reg:logistic} logistic regression. @@ -55,7 +55,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. } \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 - \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. + \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. }} \item{data}{takes an \code{xgb.DMatrix} as the input.} @@ -122,7 +122,6 @@ data(agaricus.train, package='xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- dtrain watchlist <- list(eval = dtest, train = dtrain) -param <- list(max.depth = 2, eta = 1, silent = 1) logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") preds <- 1/(1 + exp(-preds)) @@ -135,6 +134,7 @@ evalerror <- function(preds, dtrain) { err <- as.numeric(sum(labels != (preds > 0)))/length(labels) return(list(metric = "error", value = err)) } -bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror) +param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror) +bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) } diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index bf9cb5c91..7371522fd 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -5,8 +5,8 @@ \title{eXtreme Gradient Boosting (Tree) library} \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, - maximize = NULL, ...) + nrounds, verbose = 1, printEveryN = 1L, early_stop_round = NULL, + early.stop.round = NULL, maximize = NULL, ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or From 733d23aef82738d3220cf1148c18c67565b515ab Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 25 May 2015 11:51:01 -0700 Subject: [PATCH 54/60] rename arguments to be dot-seperated --- R-package/R/xgb.cv.R | 23 ++++++++++------------- R-package/R/xgb.train.R | 26 +++++++++++--------------- R-package/R/xgboost.R | 12 +++++------- R-package/man/xgb.cv.Rd | 12 +++++------- R-package/man/xgb.train.Rd | 10 ++++------ R-package/man/xgboost.Rd | 12 +++++------- 6 files changed, 40 insertions(+), 55 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index f2dd00e89..031cfda37 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -54,12 +54,11 @@ #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). 
#' If folds are supplied, the nfold and stratified parameters would be ignored. #' @param verbose \code{boolean}, print the statistics during the process -#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. -#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. +#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. +#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. -#' @param early.stop.round An alternative of \code{early_stop_round}. -#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. #' #' @param ... other parameters to pass to \code{params}. @@ -94,8 +93,8 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, printEveryN=1L, - early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) { + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L, + early.stop.round = NULL, maximize = NULL, ...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -136,9 +135,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = } # Early Stopping - if (is.null(early_stop_round) && !is.null(early.stop.round)) - early_stop_round = early.stop.round - if (!is.null(early_stop_round)){ + if (!is.null(early.stop.round)){ if (!is.null(feval) && is.null(maximize)) stop('Please set maximize to note whether the model is maximizing the evaluation or not.') if (is.null(maximize) && is.null(params$eval_metric)) @@ -178,7 +175,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = else predictValues <- rep(0,xgb.numrow(dtrain)) history <- c() - printEveryN = max(as.integer(printEveryN), 1L) + print.every.n = max(as.integer(print.every.n), 1L) for (i in 1:nrounds) { msg <- list() for (k in 1:nfold) { @@ -204,11 +201,11 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) if(verbose) - if (0==(i-1L)%%printEveryN) + if (0==(i-1L)%%print.every.n) cat(ret, "\n", sep="") # early_Stopping - if (!is.null(early_stop_round)){ + if (!is.null(early.stop.round)){ score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1] score = strsplit(score,'\\+|:')[[1]][[2]] score = as.numeric(score) @@ -216,7 +213,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = bestScore = score bestInd = i } else { - if (i-bestInd>=early_stop_round) { + if (i-bestInd>=early.stop.round) { earlyStopflag = TRUE cat('Stopping. Best iteration:',bestInd) break diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 099d8b4d0..0700577f7 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -66,12 +66,11 @@ #' prediction and dtrain, #' @param verbose If 0, xgboost will stay silent. 
If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both -#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. -#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. +#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. +#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. -#' @param early.stop.round An alternative of \code{early_stop_round}. -#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' @@ -120,9 +119,8 @@ #' @export #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), - obj = NULL, feval = NULL, verbose = 1, printEveryN=1L, - early_stop_round = NULL, early.stop.round = NULL, - maximize = NULL, ...) { + obj = NULL, feval = NULL, verbose = 1, print.every.n=1L, + early.stop.round = NULL, maximize = NULL, ...) { dtrain <- data if (typeof(params) != "list") { stop("xgb.train: first argument params must be list") @@ -157,9 +155,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } # Early stopping - if (is.null(early_stop_round) && !is.null(early.stop.round)) - early_stop_round = early.stop.round - if (!is.null(early_stop_round)){ + if (!is.null(early.stop.round)){ if (!is.null(feval) && is.null(maximize)) stop('Please set maximize to note whether the model is maximizing the evaluation or not.') if (length(watchlist) == 0) @@ -190,14 +186,14 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) - printEveryN=max( as.integer(printEveryN), 1L) + print.every.n=max( as.integer(print.every.n), 1L) for (i in 1:nrounds) { succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) - if (0== ( (i-1) %% printEveryN)) + if (0== ( (i-1) %% print.every.n)) cat(paste(msg, "\n", sep="")) - if (!is.null(early_stop_round)) + if (!is.null(early.stop.round)) { score = strsplit(msg,':|\\s+')[[1]][3] score = as.numeric(score) @@ -205,7 +201,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bestScore = score bestInd = i } else { - if (i-bestInd>=early_stop_round) { + if (i-bestInd>=early.stop.round) { earlyStopflag = TRUE cat('Stopping. Best iteration:',bestInd) break @@ -215,7 +211,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } } bst <- xgb.Booster.check(bst) - if (!is.null(early_stop_round)) { + if (!is.null(early.stop.round)) { bst$bestScore = bestScore bst$bestInd = bestInd } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index f4aa5f142..63077f866 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -28,14 +28,13 @@ #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. 
If 2, xgboost will print information of both #' performance and construction progress information -#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. +#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values. -#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. +#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. -#' @param early.stop.round An alternative of \code{early_stop_round}. -#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' @@ -58,7 +57,7 @@ #' @export #' xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, - verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL, + verbose = 1, print.every.n = 1L, early.stop.round = NULL, maximize = NULL, ...) { if (is.null(missing)) { dtrain <- xgb.get.DMatrix(data, label) @@ -74,8 +73,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), watchlist <- list() } - bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, printEveryN=printEveryN, - early_stop_round = early_stop_round, + bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n, early.stop.round = early.stop.round) return(bst) diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index f1fe97563..bb23992a2 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -7,8 +7,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, - verbose = T, printEveryN = 1L, early_stop_round = NULL, - early.stop.round = NULL, maximize = NULL, ...) + verbose = T, print.every.n = 1L, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -66,15 +66,13 @@ If folds are supplied, the nfold and stratified parameters would be ignored.} \item{verbose}{\code{boolean}, print the statistics during the process} -\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} +\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. 
If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} -\item{early.stop.round}{An alternative of \code{early_stop_round}.} - -\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. \code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 4d5d8d3e6..7b1893ba7 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -5,7 +5,7 @@ \title{eXtreme Gradient Boosting Training} \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, - feval = NULL, verbose = 1, printEveryN = 1L, early_stop_round = NULL, + feval = NULL, verbose = 1, print.every.n = 1L, early.stop.round = NULL, maximize = NULL, ...) } \arguments{ @@ -78,15 +78,13 @@ prediction and dtrain,} \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print information of performance. If 2, xgboost will print information of both} -\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} +\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} -\item{early.stop.round}{An alternative of \code{early_stop_round}.} - -\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. \code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 7371522fd..64bd00369 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -5,8 +5,8 @@ \title{eXtreme Gradient Boosting (Tree) library} \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, printEveryN = 1L, early_stop_round = NULL, - early.stop.round = NULL, maximize = NULL, ...) + nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or @@ -42,15 +42,13 @@ Commonly used ones are: information of performance. If 2, xgboost will print information of both performance and construction progress information} -\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} +\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. 
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}

-\item{early.stop.round}{An alternative of \code{early_stop_round}.}
-
-\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}

 \item{...}{other parameters to pass to \code{params}.}

From a55f4d341671e337d8dfbab7edcd45eed3a38f94 Mon Sep 17 00:00:00 2001
From: Jonathan Robinson
Date: Thu, 28 May 2015 09:45:46 -0400
Subject: [PATCH 55/60] Update xgboostPresentation.Rmd

Edited to note unavailability of stable version of this package on CRAN.
http://cran.r-project.org/web/packages/xgboost/index.html
---
 R-package/vignettes/xgboostPresentation.Rmd | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index 0bab9a1f4..b7648340d 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -57,11 +57,9 @@ devtools::install_github('dmlc/xgboost', subdir='R-package')
 Cran version
 ------------

-For stable version on *CRAN*, run:
+As of 2015-03-13, ‘xgboost’ was removed from the CRAN repository.

-```{r installCran, eval=FALSE}
-install.packages('xgboost')
-```
+Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost).

 Learning
 ========

From f9ae83e95159f7503903c4d309fe54a07fb01d5d Mon Sep 17 00:00:00 2001
From: Tong He
Date: Thu, 28 May 2015 09:30:23 -0700
Subject: [PATCH 56/60] Update xgb.cv.R

---
 R-package/R/xgb.cv.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 031cfda37..df7fd5648 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -206,7 +206,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     # early_Stopping
     if (!is.null(early.stop.round)){
-      score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1]
+      score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2]
       score = strsplit(score,'\\+|:')[[1]][[2]]
       score = as.numeric(score)
       if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
         bestScore = score

From: hetong007
Date: Sat, 30 May 2015 15:48:57 -0700
Subject: [PATCH 57/60] modify script to use objective and eval_metric

---
 R-package/demo/cross_validation.R | 6 +++---
 R-package/demo/custom_objective.R | 3 ++-
 R-package/demo/early_stopping.R   | 9 +++++----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R
index fbb38f6d8..c3148ae21 100644
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -40,10 +40,10 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }

-param <- list(max.depth=2,eta=1,silent=1)
+param <- list(max.depth=2,eta=1,silent=1,
+              objective = logregobj, eval_metric = evalerror)
 # train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5,
-       obj = logregobj, feval=evalerror)
+xgb.cv(param, dtrain, nround, nfold = 5)

 # do cross validation with prediction values for each fold
 res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R
index cb90a7b5a..201f23d98 100644
--- a/R-package/demo/custom_objective.R
+++ b/R-package/demo/custom_objective.R
@@ -61,4 +61,5 @@ logregobjattr <- function(preds, dtrain) {
 print ('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist,
+                 objective = logregobjattr, eval_metric = evalerror)
diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R
index 34dfebc0b..aa74aa2ee 100644
--- a/R-package/demo/early_stopping.R
+++ b/R-package/demo/early_stopping.R
@@ -31,9 +31,10 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }
 print ('start training with early Stopping setting')
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE,
+
+bst <- xgb.train(param, dtrain, num_round, watchlist,
+                 objective = logregobj, eval_metric = evalerror, maximize = FALSE,
                  early.stop.round = 3)
-bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror,
+bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
+              objective = logregobj, eval_metric = evalerror,
               maximize = FALSE, early.stop.round = 3)
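Taken together, the demo updates above settle on one calling convention: custom objective and evaluation functions travel inside `params`, and early stopping is requested through the dot-separated `early.stop.round`. A minimal sketch of that convention, assuming the `logregobj` and `evalerror` definitions from the demos and arbitrary round counts:

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest  <- dtrain
watchlist <- list(eval = dtest, train = dtrain)

# objective and eval_metric are now members of params
param <- list(max.depth = 2, eta = 1, silent = 1,
              objective = logregobj, eval_metric = evalerror)

# maximize = FALSE because evalerror returns an error rate;
# training stops once the score fails to improve for 3 rounds
bst <- xgb.train(param, dtrain, nrounds = 20, watchlist,
                 maximize = FALSE, early.stop.round = 3,
                 print.every.n = 2)
bst$bestScore  # best score reached on the first watchlist entry
bst$bestInd    # round at which it was reached
```

Only the first entry of `watchlist` drives the stopping decision, which is why `eval` is listed before `train` here.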
From bc7f6b37b036589fa4635fef302cbde83904afa0 Mon Sep 17 00:00:00 2001
From: Tong He
Date: Sat, 30 May 2015 17:39:19 -0700
Subject: [PATCH 58/60] Update README.md

---
 R-package/README.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/R-package/README.md b/R-package/README.md
index 7d2be411d..e974e3554 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -8,11 +8,6 @@ For up-to-date version (which is recommended), please install from github. Windo
 devtools::install_github('dmlc/xgboost',subdir='R-package')
 ```

-For stable version on CRAN, please run
-
-```r
-install.packages('xgboost')
-```

 ## Examples

From e5dd894960b6735336df90335d8f7da753b95553 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Tue, 2 Jun 2015 11:38:06 -0700
Subject: [PATCH 59/60] add an indicator opt

---
 src/tree/param.h                  |  6 +++---
 src/tree/updater_colmaker-inl.hpp | 10 ++++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/tree/param.h b/src/tree/param.h
index 1bffcb32c..20ba1e6c0 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -155,12 +155,12 @@ struct TrainParam{
     return dw;
   }
   /*! \brief whether need forward small to big search: default right */
-  inline bool need_forward_search(float col_density = 0.0f) const {
+  inline bool need_forward_search(float col_density, bool indicator) const {
     return this->default_direction == 2 ||
-        (default_direction == 0 && (col_density < opt_dense_col));
+        (default_direction == 0 && (col_density < opt_dense_col) && !indicator);
   }
   /*! \brief whether need backward big to small search: default left */
-  inline bool need_backward_search(float col_density = 0.0f) const {
+  inline bool need_backward_search(float col_density, bool indicator) const {
     return this->default_direction != 2;
   }
   /*! \brief given the loss change, whether we need to invoke pruning */
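The "indicator" here is a column whose stored entries all carry a single value, the classic case being one-hot encoded features where every non-missing entry is 1. Since column entries are kept sorted by value, the updater change below detects this by comparing the first and last stored values; for such a column the forward scan would only rediscover the splits the backward scan already enumerates, so it can be skipped. A hypothetical R illustration of what such columns look like:

```r
require(Matrix)
# one-hot encoding a factor produces indicator columns:
# every stored (non-zero) entry in the matrix equals 1
x <- sparse.model.matrix(~ Species - 1, data = iris)
range(x@x)  # 1 1, so first and last sorted entries coincide
```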
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index b52842a93..db3581aac 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -234,8 +234,9 @@ class ColMaker: public IUpdater {
                            const IFMatrix &fmat,
                            const std::vector<bst_gpair> &gpair,
                            const BoosterInfo &info) {
-    bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
-    bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
+    const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
+    bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
+    bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
     const std::vector<int> &qexpand = qexpand_;
     #pragma omp parallel
     {
@@ -530,11 +531,12 @@ class ColMaker: public IUpdater {
         const bst_uint fid = batch.col_index[i];
         const int tid = omp_get_thread_num();
         const ColBatch::Inst c = batch[i];
-        if (param.need_forward_search(fmat.GetColDensity(fid))) {
+        const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue;
+        if (param.need_forward_search(fmat.GetColDensity(fid), ind)) {
           this->EnumerateSplit(c.data, c.data + c.length, +1,
                                fid, gpair, info, stemp[tid]);
         }
-        if (param.need_backward_search(fmat.GetColDensity(fid))) {
+        if (param.need_backward_search(fmat.GetColDensity(fid), ind)) {
           this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
                                fid, gpair, info, stemp[tid]);
         }

From 2937f5eebcf7a7b6a873ab5576a5f5f66a9a71d4 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Tue, 2 Jun 2015 23:18:31 -0700
Subject: [PATCH 60/60] io part refactor

---
 src/data.h                    |   6 +-
 src/io/page_dmatrix-inl.hpp   |  11 +-
 src/io/page_fmatrix-inl.hpp   |  16 +-
 src/io/simple_dmatrix-inl.hpp |  43 +++++-
 src/io/simple_fmatrix-inl.hpp | 279 +++++++++++++++++++++++-----------
 src/io/sparse_batch_page.h    |  18 ++-
 src/learner/learner-inl.hpp   |  16 +-
 7 files changed, 276 insertions(+), 113 deletions(-)

diff --git a/src/data.h b/src/data.h
index d1f5eb427..63dd2d78f 100644
--- a/src/data.h
+++ b/src/data.h
@@ -140,8 +140,12 @@ class IFMatrix {
    * \brief check if column access is supported, if not, initialize column access
    * \param enabled whether certain feature should be included in column access
    * \param subsample subsample ratio when generating column access
+   * \param max_row_perbatch auxiliary information, maximum row used in each column batch;
+   *        this is a hint that can be ignored by the implementation
    */
-  virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
+  virtual void InitColAccess(const std::vector<bool> &enabled,
+                             float subsample,
+                             size_t max_row_perbatch) = 0;
   // the following are column meta data, should be able to answer them fast
   /*! \return whether column access is enabled */
   virtual bool HaveColAccess(void) const = 0;
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 8fb02e18e..79455d130 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -33,10 +33,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   }
   virtual bool Next(void) {
     if (!itr.Next(page_)) return false;
-    out_.base_rowid = base_rowid_;
-    out_.ind_ptr = BeginPtr(page_->offset);
-    out_.data_ptr = BeginPtr(page_->data);
-    out_.size = page_->offset.size() - 1;
+    out_ = page_->GetRowBatch(base_rowid_);
     base_rowid_ += out_.size;
     return true;
   }
@@ -198,8 +195,8 @@ class DMatrixPageBase : public DataMatrix {
   }
   /*! 
\brief magic number used to identify DMatrix */ static const int kMagic = TKMagic; - /*! \brief page size 64 MB */ - static const size_t kPageSize = 64UL << 20UL; + /*! \brief page size 32 MB */ + static const size_t kPageSize = 32UL << 20UL; protected: virtual void set_cache_file(const std::string &cache_file) = 0; @@ -236,7 +233,7 @@ class DMatrixPage : public DMatrixPageBase<0xffffab02> { class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> { public: DMatrixHalfRAM(void) { - fmat_ = new FMatrixS(iter_); + fmat_ = new FMatrixS(iter_, this->info); } virtual ~DMatrixHalfRAM(void) { delete fmat_; diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 7d4cdb9cf..18f4c6dee 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -58,11 +58,13 @@ struct ColConvertFactory { return true; } inline void Setup(float pkeep, + size_t max_row_perbatch, size_t num_col, utils::IIterator *iter, std::vector *buffered_rowset, const std::vector *enabled) { pkeep_ = pkeep; + max_row_perbatch_ = max_row_perbatch; num_col_ = num_col; iter_ = iter; buffered_rowset_ = buffered_rowset; @@ -87,7 +89,8 @@ struct ColConvertFactory { tmp_.Push(batch[i]); } } - if (tmp_.MemCostBytes() >= kPageSize) { + if (tmp_.MemCostBytes() >= kPageSize || + tmp_.Size() >= max_row_perbatch_) { this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop, *enabled_, val); return true; @@ -157,6 +160,8 @@ struct ColConvertFactory { } // probability of keep float pkeep_; + // maximum number of rows per batch + size_t max_row_perbatch_; // number of columns size_t num_col_; // row batch iterator @@ -208,10 +213,10 @@ class FMatrixPage : public IFMatrix { return 1.0f - (static_cast(nmiss)) / num_buffered_row_; } virtual void InitColAccess(const std::vector &enabled, - float pkeep = 1.0f) { + float pkeep, size_t max_row_perbatch) { if (this->HaveColAccess()) return; if (TryLoadColData()) return; - this->InitColData(enabled, pkeep); + this->InitColData(enabled, pkeep, max_row_perbatch); utils::Check(TryLoadColData(), "failed on creating col.blob"); } /*! @@ -282,7 +287,8 @@ class FMatrixPage : public IFMatrix { * \brief intialize column data * \param pkeep probability to keep a row */ - inline void InitColData(const std::vector &enabled, float pkeep) { + inline void InitColData(const std::vector &enabled, + float pkeep, size_t max_row_perbatch) { // clear rowset buffered_rowset_.clear(); col_size_.resize(info.num_col()); @@ -294,7 +300,7 @@ class FMatrixPage : public IFMatrix { size_t bytes_write = 0; utils::ThreadBuffer citer; citer.SetParam("buffer_size", "2"); - citer.get_factory().Setup(pkeep, info.num_col(), + citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(), iter_, &buffered_rowset_, &enabled); citer.Init(); SparsePage *pcol; diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 9b0addc1c..3876c21ad 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -28,7 +28,7 @@ class DMatrixSimple : public DataMatrix { public: // constructor DMatrixSimple(void) : DataMatrix(kMagic) { - fmat_ = new FMatrixS(new OneBatchIter(this)); + fmat_ = new FMatrixS(new OneBatchIter(this), this->info); this->Clear(); } // virtual destructor @@ -171,7 +171,7 @@ class DMatrixSimple : public DataMatrix { utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? 
"" : fname); info.LoadBinary(fs); - FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); + LoadBinary(fs, &row_ptr_, &row_data_); fmat_->LoadColAccess(fs); if (!silent) { @@ -198,9 +198,8 @@ class DMatrixSimple : public DataMatrix { utils::FileStream fs(utils::FopenCheck(fname, "wb")); int tmagic = kMagic; fs.Write(&tmagic, sizeof(tmagic)); - info.SaveBinary(fs); - FMatrixS::SaveBinary(fs, row_ptr_, row_data_); + SaveBinary(fs, row_ptr_, row_data_); fmat_->SaveColAccess(fs); fs.Close(); @@ -251,6 +250,42 @@ class DMatrixSimple : public DataMatrix { static const int kMagic = 0xffffab01; protected: + /*! + * \brief save data to binary stream + * \param fo output stream + * \param ptr pointer data + * \param data data content + */ + inline static void SaveBinary(utils::IStream &fo, + const std::vector &ptr, + const std::vector &data) { + size_t nrow = ptr.size() - 1; + fo.Write(&nrow, sizeof(size_t)); + fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t)); + if (data.size() != 0) { + fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry)); + } + } + /*! + * \brief load data from binary stream + * \param fi input stream + * \param out_ptr pointer data + * \param out_data data content + */ + inline static void LoadBinary(utils::IStream &fi, + std::vector *out_ptr, + std::vector *out_data) { + size_t nrow; + utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format"); + out_ptr->resize(nrow + 1); + utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0, + "invalid input file format"); + out_data->resize(out_ptr->back()); + if (out_data->size() != 0) { + utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0, + "invalid input file format"); + } + } // one batch iterator that return content in the matrix struct OneBatchIter: utils::IIterator { explicit OneBatchIter(DMatrixSimple *parent) diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index acf85297f..fc6aab8f9 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -1,15 +1,18 @@ -#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP -#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP +#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ +#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ /*! * \file simple_fmatrix-inl.hpp * \brief the input data structure for gradient boosting * \author Tianqi Chen */ +#include #include "../data.h" #include "../utils/utils.h" #include "../utils/random.h" #include "../utils/omp.h" +#include "../learner/dmatrix.h" #include "../utils/group_data.h" +#include "./sparse_batch_page.h" namespace xgboost { namespace io { @@ -20,21 +23,23 @@ class FMatrixS : public IFMatrix { public: typedef SparseBatch::Entry Entry; /*! \brief constructor */ - FMatrixS(utils::IIterator *iter) { + FMatrixS(utils::IIterator *iter, + const learner::MetaInfo &info) + : info_(info) { this->iter_ = iter; } // destructor virtual ~FMatrixS(void) { - if (iter_ != NULL) delete iter_; + if (iter_ != NULL) delete iter_; } /*! \return whether column access is enabled */ virtual bool HaveColAccess(void) const { - return col_ptr_.size() != 0; + return col_size_.size() != 0; } /*! \brief get number of colmuns */ virtual size_t NumCol(void) const { utils::Check(this->HaveColAccess(), "NumCol:need column access"); - return col_ptr_.size() - 1; + return col_size_.size() - 1; } /*! \brief get number of buffered rows */ virtual const std::vector &buffered_rowset(void) const { @@ -42,17 +47,17 @@ class FMatrixS : public IFMatrix { } /*! 
\brief get column size */ virtual size_t GetColSize(size_t cidx) const { - return col_ptr_[cidx+1] - col_ptr_[cidx]; + return col_size_[cidx]; } /*! \brief get column density */ virtual float GetColDensity(size_t cidx) const { - size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]); + size_t nmiss = buffered_rowset_.size() - col_size_[cidx]; return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); } virtual void InitColAccess(const std::vector &enabled, - float pkeep = 1.0f) { + float pkeep, size_t max_row_perbatch) { if (this->HaveColAccess()) return; - this->InitColData(pkeep, enabled); + this->InitColData(enabled, pkeep, max_row_perbatch); } /*! * \brief get the row iterator associated with FMatrix @@ -70,7 +75,7 @@ class FMatrixS : public IFMatrix { for (size_t i = 0; i < ncol; ++i) { col_iter_.col_index_[i] = static_cast(i); } - col_iter_.SetBatch(col_ptr_, col_data_); + col_iter_.BeforeFirst(); return &col_iter_; } /*! @@ -82,7 +87,7 @@ class FMatrixS : public IFMatrix { for (size_t i = 0; i < fset.size(); ++i) { if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); } - col_iter_.SetBatch(col_ptr_, col_data_); + col_iter_.BeforeFirst(); return &col_iter_; } /*! @@ -90,64 +95,52 @@ class FMatrixS : public IFMatrix { * \param fo output stream to save to */ inline void SaveColAccess(utils::IStream &fo) const { - fo.Write(buffered_rowset_); - if (buffered_rowset_.size() != 0) { - SaveBinary(fo, col_ptr_, col_data_); - } + size_t n = 0; + fo.Write(&n, sizeof(n)); } /*! * \brief load column access data from stream * \param fo output stream to load from */ inline void LoadColAccess(utils::IStream &fi) { - utils::Check(fi.Read(&buffered_rowset_), "invalid input file format"); - if (buffered_rowset_.size() != 0) { - LoadBinary(fi, &col_ptr_, &col_data_); - } + // do nothing in load col access } - /*! - * \brief save data to binary stream - * \param fo output stream - * \param ptr pointer data - * \param data data content - */ - inline static void SaveBinary(utils::IStream &fo, - const std::vector &ptr, - const std::vector &data) { - size_t nrow = ptr.size() - 1; - fo.Write(&nrow, sizeof(size_t)); - fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t)); - if (data.size() != 0) { - fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry)); - } - } - /*! - * \brief load data from binary stream - * \param fi input stream - * \param out_ptr pointer data - * \param out_data data content - */ - inline static void LoadBinary(utils::IStream &fi, - std::vector *out_ptr, - std::vector *out_data) { - size_t nrow; - utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format"); - out_ptr->resize(nrow + 1); - utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0, - "invalid input file format"); - out_data->resize(out_ptr->back()); - if (out_data->size() != 0) { - utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0, - "invalid input file format"); - } - } - + protected: /*! 
* \brief intialize column data + * \param enabled the list of enabled columns * \param pkeep probability to keep a row + * \param max_row_perbatch maximum row per batch */ - inline void InitColData(float pkeep, const std::vector &enabled) { + inline void InitColData(const std::vector &enabled, + float pkeep, size_t max_row_perbatch) { + col_iter_.Clear(); + if (info_.num_row() < max_row_perbatch) { + SparsePage *page = new SparsePage(); + this->MakeOneBatch(enabled, pkeep, page); + col_iter_.cpages_.push_back(page); + } else { + this->MakeManyBatch(enabled, pkeep, max_row_perbatch); + } + // setup col-size + col_size_.resize(info_.num_col()); + std::fill(col_size_.begin(), col_size_.end(), 0); + for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) { + SparsePage *pcol = col_iter_.cpages_[i]; + for (size_t j = 0; j < pcol->Size(); ++j) { + col_size_[j] += pcol->offset[j + 1] - pcol->offset[j]; + } + } + } + /*! + * \brief make column page from iterator + * \param pkeep probability to keep a row + * \param pcol the target column + */ + inline void MakeOneBatch(const std::vector &enabled, + float pkeep, + SparsePage *pcol) { // clear rowset buffered_rowset_.clear(); // bit map @@ -157,8 +150,9 @@ class FMatrixS : public IFMatrix { { nthread = omp_get_num_threads(); } - // build the column matrix in parallel - utils::ParallelGroupBuilder builder(&col_ptr_, &col_data_); + pcol->Clear(); + utils::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); builder.InitBudget(0, nthread); // start working iter_->BeforeFirst(); @@ -189,7 +183,7 @@ class FMatrixS : public IFMatrix { } } builder.InitStorage(); - + iter_->BeforeFirst(); while (iter_->Next()) { const RowBatch &batch = iter_->Value(); @@ -209,66 +203,167 @@ class FMatrixS : public IFMatrix { } } } + + utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data"); // sort columns - bst_omp_uint ncol = static_cast(this->NumCol()); - #pragma omp parallel for schedule(static) + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) for (bst_omp_uint i = 0; i < ncol; ++i) { - if (col_ptr_[i] < col_ptr_[i + 1]) { - std::sort(BeginPtr(col_data_) + col_ptr_[i], - BeginPtr(col_data_) + col_ptr_[i + 1], Entry::CmpValue); + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(BeginPtr(pcol->data) + pcol->offset[i], + BeginPtr(pcol->data) + pcol->offset[i + 1], + SparseBatch::Entry::CmpValue); + } + } + } + + inline void MakeManyBatch(const std::vector &enabled, + float pkeep, size_t max_row_perbatch) { + size_t btop = 0; + buffered_rowset_.clear(); + // internal temp cache + SparsePage tmp; tmp.Clear(); + iter_->BeforeFirst(); + while (iter_->Next()) { + const RowBatch &batch = iter_->Value(); + for (size_t i = 0; i < batch.size; ++i) { + bst_uint ridx = static_cast(batch.base_rowid + i); + if (pkeep == 1.0f || random::SampleBinary(pkeep)) { + buffered_rowset_.push_back(ridx); + tmp.Push(batch[i]); + } + if (tmp.Size() >= max_row_perbatch) { + SparsePage *page = new SparsePage(); + this->MakeColPage(tmp.GetRowBatch(0), + BeginPtr(buffered_rowset_) + btop, + enabled, page); + col_iter_.cpages_.push_back(page); + btop = buffered_rowset_.size(); + tmp.Clear(); + } + } + } + if (tmp.Size() != 0) { + SparsePage *page = new SparsePage(); + this->MakeColPage(tmp.GetRowBatch(0), + BeginPtr(buffered_rowset_) + btop, + enabled, page); + col_iter_.cpages_.push_back(page); + } + } + // make column page from subset of rowbatchs + inline void MakeColPage(const RowBatch &batch, + const 
bst_uint *ridx, + const std::vector &enabled, + SparsePage *pcol) { + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1); + if (nthread > max_nthread) { + nthread = max_nthread; + } + } + pcol->Clear(); + utils::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); + builder.InitBudget(info_.num_col(), nthread); + bst_omp_uint ndata = static_cast(batch.size); + #pragma omp parallel for schedule(static) num_threads(nthread) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + const SparseBatch::Entry &e = inst[j]; + if (enabled[e.index]) { + builder.AddBudget(e.index, tid); + } + } + } + builder.InitStorage(); + #pragma omp parallel for schedule(static) num_threads(nthread) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + const SparseBatch::Entry &e = inst[j]; + builder.Push(e.index, + SparseBatch::Entry(ridx[i], e.fvalue), + tid); + } + } + utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data"); + // sort columns + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) + for (bst_omp_uint i = 0; i < ncol; ++i) { + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(BeginPtr(pcol->data) + pcol->offset[i], + BeginPtr(pcol->data) + pcol->offset[i + 1], + SparseBatch::Entry::CmpValue); } } } private: // one batch iterator that return content in the matrix - struct OneBatchIter: utils::IIterator { - OneBatchIter(void) : at_first_(true){} - virtual ~OneBatchIter(void) {} + struct ColBatchIter: utils::IIterator { + ColBatchIter(void) : data_ptr_(0) {} + virtual ~ColBatchIter(void) { + this->Clear(); + } virtual void BeforeFirst(void) { - at_first_ = true; + data_ptr_ = 0; } virtual bool Next(void) { - if (!at_first_) return false; - at_first_ = false; - return true; - } - virtual const ColBatch &Value(void) const { - return batch_; - } - inline void SetBatch(const std::vector &ptr, - const std::vector &data) { + if (data_ptr_ >= cpages_.size()) return false; + data_ptr_ += 1; + SparsePage *pcol = cpages_[data_ptr_ - 1]; batch_.size = col_index_.size(); col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0)); for (size_t i = 0; i < col_data_.size(); ++i) { const bst_uint ridx = col_index_[i]; - col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx], - static_cast(ptr[ridx+1] - ptr[ridx])); + col_data_[i] = SparseBatch::Inst + (BeginPtr(pcol->data) + pcol->offset[ridx], + static_cast(pcol->offset[ridx + 1] - pcol->offset[ridx])); } batch_.col_index = BeginPtr(col_index_); - batch_.col_data = BeginPtr(col_data_); - this->BeforeFirst(); + batch_.col_data = BeginPtr(col_data_); + return true; + } + virtual const ColBatch &Value(void) const { + return batch_; + } + inline void Clear(void) { + for (size_t i = 0; i < cpages_.size(); ++i) { + delete cpages_[i]; + } + cpages_.clear(); } // data content std::vector col_index_; + // column content std::vector col_data_; - // whether is at first - bool at_first_; + // column sparse pages + std::vector cpages_; + // data pointer + size_t data_ptr_; // temporal space for batch ColBatch batch_; - }; + }; // --- data structure used to support InitColAccess -- // column iterator - OneBatchIter col_iter_; + ColBatchIter col_iter_; + // shared meta info with DMatrix + 
const learner::MetaInfo &info_;
  // row iterator
  utils::IIterator<RowBatch> *iter_;
  /*! \brief list of row index that are buffered */
  std::vector<bst_uint> buffered_rowset_;
  // count for column data
  std::vector<size_t> col_size_;
};
}  // namespace io
}  // namespace xgboost
#endif  // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h
index 319f9da5c..d94141a6e 100644
--- a/src/io/sparse_batch_page.h
+++ b/src/io/sparse_batch_page.h
@@ -178,8 +178,22 @@ class SparsePage {
     offset.push_back(offset.back() + inst.length);
     size_t begin = data.size();
     data.resize(begin + inst.length);
-    std::memcpy(BeginPtr(data) + begin, inst.data,
-                sizeof(SparseBatch::Entry) * inst.length);
+    if (inst.length != 0) {
+      std::memcpy(BeginPtr(data) + begin, inst.data,
+                  sizeof(SparseBatch::Entry) * inst.length);
+    }
+  }
+  /*!
+   * \param base_rowid base_rowid of the data
+   * \return row batch representation of the page
+   */
+  inline RowBatch GetRowBatch(size_t base_rowid) const {
+    RowBatch out;
+    out.base_rowid = base_rowid;
+    out.ind_ptr = BeginPtr(offset);
+    out.data_ptr = BeginPtr(data);
+    out.size = offset.size() - 1;
+    return out;
   }

  private:
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 5a080d5b1..45e312aa7 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -33,6 +33,7 @@ class BoostLearner : public rabit::Serializable {
     silent= 0;
     prob_buffer_row = 1.0f;
     distributed_mode = 0;
+    updater_mode = 0;
     pred_buffer_size = 0;
     seed_per_iteration = 0;
     seed = 0;
@@ -95,6 +96,7 @@ class BoostLearner : public rabit::Serializable {
         utils::Error("%s is invalid value for dsplit, should be row or col", val);
       }
     }
+    if (!strcmp(name, "updater_mode")) updater_mode = atoi(val);
    if (!strcmp(name, "prob_buffer_row")) {
      prob_buffer_row = static_cast<float>(atof(val));
      utils::Check(distributed_mode == 0,
@@ -259,9 +261,17 @@ class BoostLearner : public rabit::Serializable {
   */
  inline void CheckInit(DMatrix *p_train) {
    int ncol = static_cast<int>(p_train->info.info.num_col);
-    std::vector<bool> enabled(ncol, true);
+    std::vector<bool> enabled(ncol, true);
+    // set max row per batch to limited value
+    // in distributed mode, use safe choice otherwise
+    size_t max_row_perbatch = std::numeric_limits<size_t>::max();
+    if (updater_mode != 0 || distributed_mode == 2) {
+      max_row_perbatch = 32UL << 10UL;
+    }
    // initialize column access
-    p_train->fmat()->InitColAccess(enabled, prob_buffer_row);
+    p_train->fmat()->InitColAccess(enabled,
+                                   prob_buffer_row,
+                                   max_row_perbatch);
    const int kMagicPage = 0xffffab02;
    // check, if it is DMatrixPage, then use hist maker
    if (p_train->magic == kMagicPage) {
@@ -480,6 +490,8 @@ class BoostLearner : public rabit::Serializable {
  int silent;
  // distributed learning mode, if any, 0:none, 1:col, 2:row
  int distributed_mode;
+  // updater mode, 0:normal, reserved for internal test
+  int updater_mode;
  // cached size of predict buffer
  size_t pred_buffer_size;
  // maximum buffered row value
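Stepping back from the C++ internals, the R-facing result of this series is the dot-separated early stopping interface. A closing sketch of it, assuming the bundled agaricus data and arbitrary parameter values:

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param <- list(max.depth = 2, eta = 1, silent = 1,
              objective = 'binary:logistic', eval_metric = 'error')
# with a built-in eval_metric, maximize is inferred (error => FALSE);
# the stopping rule watches the test column of the CV output
history <- xgb.cv(param, dtrain, nrounds = 50, nfold = 5,
                  early.stop.round = 3, print.every.n = 5)
```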