Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity

This commit is contained in:
tqchen 2014-09-08 16:20:48 -07:00
commit a3d5930f26
67 changed files with 1282 additions and 8708 deletions


@ -48,7 +48,8 @@ Rpack:
rm -rf xgboost/inst/examples/*.model
rm -rf xgboost/inst/examples/dump*
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper


@ -1,7 +1,7 @@
Package: xgboost
Type: Package
Title: eXtreme Gradient Boosting
Version: 0.3-1
Version: 0.3-2
Date: 2014-08-23
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
Maintainer: Tong He <hetong007@gmail.com>
@ -18,7 +18,7 @@ License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/tqchen/xgboost
BugReports: https://github.com/tqchen/xgboost/issues
Depends:
R (>= 2.0.2)
R (>= 2.10)
Imports:
Matrix (>= 1.1-0),
methods


@ -1,9 +1,11 @@
# Generated by roxygen2 (4.0.1): do not edit by hand
export(getinfo)
export(setinfo)
export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.save)
export(xgb.cv)
export(xgb.dump)
export(xgb.load)
export(xgb.save)


@ -5,10 +5,13 @@ setClass('xgb.DMatrix')
#' Get information of an xgb.DMatrix object
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' labels <- getinfo(dtrain, "label")
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
#'


@ -11,12 +11,17 @@ setClass("xgb.Booster")
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
#' output value before logistic transformation.
#' @param ntreelimit limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear.
#' set it to be value bigger than 0. It will use all trees by default.
#' @param ntreelimit limit the number of trees used in prediction; this parameter is
#' only valid for gbtree, but not for gblinear. Set it to a value bigger
#' than 0. All trees are used by default.
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#' @export
#'
setMethod("predict", signature = "xgb.Booster",


@ -0,0 +1,29 @@
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname setinfo
#' @export
#'
setinfo <- function(object, ...){
UseMethod("setinfo")
}
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @param info the specific field of information to set
#' @param ... other parameters
#' @rdname setinfo
#' @method setinfo xgb.DMatrix
setMethod("setinfo", signature = "xgb.DMatrix",
definition = function(object, name, info) {
xgb.setinfo(object, name, info)
})


@ -7,9 +7,9 @@ setClass('xgb.DMatrix')
#' original xgb.DMatrix object
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' dsub <- slice(dtrain, 1:3)
#' @rdname slice
#' @export


@ -52,7 +52,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p),
.Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p),
PACKAGE = "xgboost")
}
}
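The gsub call above is what lets R-style dotted parameter names such as max.depth reach the core as underscore names; a quick illustration:

# illustration: dotted R parameter names become underscore-style core names
gsub("\\.", "_", c("max.depth", "eval.metric"))
# -> "max_depth" "eval_metric"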
@ -65,36 +65,34 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
return(structure(handle, class = "xgb.Booster"))
}
# predict, depreciated
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.predict: first argument must be type xgb.Booster")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.predict: second argument must be type xgb.DMatrix")
}
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin),
PACKAGE = "xgboost")
return(ret)
}
## ----the following are low level iterative functions, not needed if
## you do not want to use them ---------------------------------------
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
# get dmatrix from data, label
xgb.get.DMatrix <- function(data, label = NULL) {
inClass <- class(data)
if (inClass == "dgCMatrix" || inClass == "matrix") {
if (is.null(label)) {
stop("xgboost: need label when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label)
} else {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
}
if (inClass == "character") {
dtrain <- xgb.DMatrix(data)
} else if (inClass == "xgb.DMatrix") {
dtrain <- data
} else {
stop("xgboost: Invalid input of data")
}
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
PACKAGE = "xgboost")
return(TRUE)
return (dtrain)
}
xgb.numrow <- function(dmat) {
nrow <- .Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost")
return(nrow)
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
@ -108,8 +106,28 @@ xgb.iter.boost <- function(booster, dtrain, gpair) {
return(TRUE)
}
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
if (is.null(obj)) {
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
PACKAGE = "xgboost")
} else {
pred <- predict(booster, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(booster, dtrain, gpair)
}
return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
@ -122,18 +140,75 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
}
}
if (length(watchlist) != 0) {
evnames <- list()
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
if (is.null(feval)) {
evnames <- list()
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
evnames, PACKAGE = "xgboost")
} else {
msg <- paste("[", iter, "]", sep="")
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
ret <- feval(predict(booster, w[[1]]), w[[1]])
msg <- paste(msg, "\t", names(w), "-", ret$metric, ":", ret$value, sep="")
}
evnames <- append(evnames, names(w))
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
evnames, PACKAGE = "xgboost")
} else {
msg <- ""
}
return(msg)
}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) / nfold
idset <- list()
for (i in 1:nfold) {
idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
}
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])
didx = c()
for (i in 1:nfold) {
if (i != k) {
didx <- append(didx, idset[[i]])
}
}
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
}
return (ret)
}
xgb.cv.aggcv <- function(res, showsd = TRUE) {
header <- res[[1]]
ret <- header[1]
for (i in 2:length(header)) {
kv <- strsplit(header[i], ":")[[1]]
ret <- paste(ret, "\t", kv[1], ":", sep="")
stats <- c()
stats[1] <- as.numeric(kv[2])
for (j in 2:length(res)) {
tkv <- strsplit(res[[j]][i], ":")[[1]]
stats[j] <- as.numeric(tkv[2])
}
ret <- paste(ret, sprintf("%f", mean(stats)), sep="")
if (showsd) {
ret <- paste(ret, sprintf("+%f", sd(stats)), sep="")
}
}
return (ret)
}
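For illustration, a hypothetical input/output pair for xgb.cv.aggcv, based only on the string handling above (strsplit on ":", mean, and "+sd" formatting):

# hypothetical illustration: two folds' evaluation vectors as produced by xgb.iter.eval
res <- list(c("[0]", "test-error:0.04", "test-auc:0.98"),
            c("[0]", "test-error:0.06", "test-auc:0.97"))
# xgb.cv.aggcv(res) would yield:
# "[0]\ttest-error:0.050000+0.014142\ttest-auc:0.975000+0.007071"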


@ -11,11 +11,11 @@
#' @param ... other information to pass to \code{info}.
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
#'
xgb.DMatrix <- function(data, info = list(), missing = 0, ...) {


@ -2,15 +2,15 @@
#'
#' Save xgb.DMatrix object to binary file
#'
#' @param DMatrix the model object.
#' @param DMatrix the DMatrix object
#' @param fname the name of the binary file.
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
#'
xgb.DMatrix.save <- function(DMatrix, fname) {
@ -22,6 +22,6 @@ xgb.DMatrix.save <- function(DMatrix, fname) {
PACKAGE = "xgboost")
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
stop("xgb.DMatrix.save: the input must be xgb.DMatrix")
return(FALSE)
}

R-package/R/xgb.cv.R (new file)

@ -0,0 +1,86 @@
#' Cross Validation
#'
#' The cross validation function of xgboost
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:linear} linear regression
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#' further details. See also demo/ for a walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label optional field, used when data is a matrix
#' @param showsd boolean, whether to show the standard deviation of cross validation
#' @param metrics list of evaluation metrics to be used in cross validation;
#' when it is not specified, the evaluation metric is chosen according to the objective function.
#' Possible options are:
#' \itemize{
#' \item \code{error} binary classification error rate
#' \item \code{rmse} Root mean square error
#' \item \code{logloss} negative log-likelihood function
#' \item \code{auc} Area under curve
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
#' }
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain.
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the cross validation function for xgboost
#'
#' Parallelization is automatically enabled if OpenMP is present.
#' Number of threads can also be manually specified via "nthread" parameter.
#'
#' This function only accepts an \code{xgb.DMatrix} object as the input.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
#' "max.depth"=3, "eta"=1, "objective"="binary:logistic")
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL,
showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, ...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
dtrain <- xgb.get.DMatrix(data, label)
params <- append(params, list(...))
params <- append(params, list(silent=1))
for (mc in metrics) {
params <- append(params, list("eval_metric"=mc))
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
history <- list()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval),
"\t")[[1]]
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- append(history, ret)
cat(paste(ret, "\n", sep=""))
}
return (TRUE)
}
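Since xgb.cv accepts the same obj and feval hooks as xgb.train, a minimal sketch of cross validation with a customized objective and evaluation function (adapted from the cross_validation demo below, and assuming dtrain from the roxygen example above):

# sketch: cross validation with a customized objective and evaluation function
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  list(grad = preds - labels, hess = preds * (1 - preds))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  list(metric = "error", value = mean(labels != (preds > 0)))
}
xgb.cv(params = list(max.depth = 2, eta = 1), data = dtrain,
       nrounds = 2, nfold = 5, obj = logregobj, feval = evalerror)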


@ -7,14 +7,18 @@
#' @param fmap feature map file representing the type of feature.
#' Detailed description could be found at
#' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
#' Run inst/examples/demo.R for the result and inst/examples/featmap.txt
#' See demo/ for a walkthrough example in R, and
#' \url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt}
#' for an example of the format.
#'
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
#' xgb.dump(bst, 'iris.xgb.model.dump')
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump')
#' @export
#'
xgb.dump <- function(model, fname, fmap = "") {


@ -5,11 +5,15 @@
#' @param modelfile the name of the binary file.
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
#' xgb.save(bst, 'iris.xgb.model')
#' bst <- xgb.load('iris.xgb.model')
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' pred <- predict(bst, test$data)
#' @export
#'
xgb.load <- function(modelfile) {


@ -6,11 +6,15 @@
#' @param fname the name of the binary file.
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
#' xgb.save(bst, 'iris.xgb.model')
#' bst <- xgb.load('iris.xgb.model')
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' pred <- predict(bst, test$data)
#' @export
#'
xgb.save <- function(model, fname) {


@ -10,13 +10,13 @@
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max_depth} maximum depth of the tree
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#' further details. See also inst/examples/demo.R for walkthrough example in R.
#' @param dtrain takes an \code{xgb.DMatrix} as the input.
#' further details. See also demo/ for a walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param watchlist what information should be printed when \code{verbose=1} or
#' \code{verbose=2}. Watchlist is used to specify validation set monitoring
@ -29,6 +29,9 @@
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress.
#'
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@ -43,12 +46,11 @@
#'
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- dtrain
#' watchlist <- list(eval = dtest, train = dtrain)
#' param <- list(max_depth = 2, eta = 1, silent = 1)
#' param <- list(max.depth = 2, eta = 1, silent = 1)
#' logregobj <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' preds <- 1/(1 + exp(-preds))
@ -64,48 +66,32 @@
#' bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
#' @export
#'
xgb.train <- function(params=list(), dtrain, nrounds, watchlist = list(),
obj = NULL, feval = NULL, ...) {
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
obj = NULL, feval = NULL, verbose = 1, ...) {
dtrain <- data
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix")
}
if (verbose > 1) {
params <- append(params, list(silent = 0))
} else {
params <- append(params, list(silent = 1))
}
if (length(watchlist) != 0 && verbose == 0) {
warning('watchlist is provided but verbose=0, no evaluation information will be printed')
watchlist <- list()
}
params = append(params, list(...))
bst <- xgb.Booster(params, append(watchlist, dtrain))
for (i in 1:nrounds) {
if (is.null(obj)) {
succ <- xgb.iter.update(bst, dtrain, i - 1)
} else {
pred <- xgb.predict(bst, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(bst, dtrain, gpair)
}
succ <- xgb.iter.update(bst, dtrain, i - 1, obj)
if (length(watchlist) != 0) {
if (is.null(feval)) {
msg <- xgb.iter.eval(bst, watchlist, i - 1)
cat(msg)
cat("\n")
} else {
cat("[")
cat(i)
cat("]")
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
cat("\t")
cat(names(w))
cat("-")
cat(ret$metric)
cat(":")
cat(ret$value)
}
cat("\n")
}
msg <- xgb.iter.eval(bst, watchlist, i - 1, feval)
cat(paste(msg, "\n", sep=""))
}
}
return(bst)
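A minimal usage sketch of the updated signature (assuming dtrain and watchlist as in the roxygen example above):

# sketch: the new signature takes data= and a verbose switch
bst <- xgb.train(params = list(max.depth = 2, eta = 1, objective = "binary:logistic"),
                 data = dtrain, nrounds = 2, watchlist = watchlist, verbose = 1)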


@ -14,12 +14,12 @@
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max_depth} maximum depth of the tree
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#' further details. See also inst/examples/demo.R for walkthrough example in R.
#' further details. See also demo/ for a walkthrough example in R.
#' @param nrounds the max number of iterations
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
@ -33,39 +33,83 @@
#' Number of threads can also be manually specified via "nthread" parameter
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#'
#' @export
#'
xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
verbose = 1, ...) {
inClass <- class(data)
if (inClass == "dgCMatrix" || inClass == "matrix") {
if (is.null(label))
stop("xgboost: need label when data is a matrix")
dtrain <- xgb.DMatrix(data, label = label)
} else {
if (!is.null(label))
warning("xgboost: label will be ignored.")
if (inClass == "character")
dtrain <- xgb.DMatrix(data) else if (inClass == "xgb.DMatrix")
dtrain <- data else stop("xgboost: Invalid input of data")
}
if (verbose > 1) {
silent <- 0
} else {
silent <- 1
}
params <- append(params, list(silent = silent))
dtrain <- xgb.get.DMatrix(data, label)
params <- append(params, list(...))
if (verbose > 0)
watchlist <- list(train = dtrain) else watchlist <- list()
if (verbose > 0) {
watchlist <- list(train = dtrain)
} else {
watchlist <- list()
}
bst <- xgb.train(params, dtrain, nrounds, watchlist)
bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose)
return(bst)
}
#' Training part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#' \item \code{label} the label for each record
#' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.train
#' @usage data(agaricus.train)
#' @format A list containing a label vector, and a dgCMatrix object with 6513
#' rows and 127 variables
NULL
#' Test part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#' \item \code{label} the label for each record
#' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.test
#' @usage data(agaricus.test)
#' @format A list containing a label vector, and a dgCMatrix object with 1611
#' rows and 127 variables
NULL
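A quick sketch of loading and inspecting the bundled datasets documented above:

data(agaricus.train, package = 'xgboost')
dim(agaricus.train$data)     # 6513 x 127 sparse dgCMatrix
table(agaricus.train$label)  # binary labels in {0, 1}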


@ -17,5 +17,5 @@ install.packages('xgboost')
## Examples
* Please visit [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for walk throughe example.
* Please visit [walk through example](https://github.com/tqchen/xgboost/blob/master/R-package/demo).
* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.

Binary file not shown.

Binary file not shown.

R-package/demo/00Index (new file)

@ -0,0 +1,6 @@
basic_walkthrough Basic feature walkthrough
custom_objective Customize loss function and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation

R-package/demo/README.md (new file)

@ -0,0 +1,17 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Customize loss function and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
Benchmarks
====
* [Starter script for Kaggle Higgs Boson](../../demo/kaggle-higgs)
Notes
====
* Contributions of examples and benchmarks are more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)


@ -0,0 +1,93 @@
require(xgboost)
require(methods)
# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom can be eaten
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
class(train$label)
class(train$data)
#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost: you can put a matrix in the data field
# note: we are putting in a sparse matrix here, xgboost naturally handles sparse input
# use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoding)
print("training xgboost with sparseMatrix")
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")
# alternatively, you can put in dense matrix, i.e. basic R-matrix
print("training xgboost with Matrix")
bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")
# you can also put in an xgb.DMatrix object, which stores label, data and other metadata needed for advanced features
print("training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
# Verbose = 0,1,2
print ('train xgboost with verbose 0, no message')
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 0)
print ('train xgboost with verbose 1, print evaluation metric')
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 1)
print ('train xgboost with verbose 2, also print information about tree')
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 2)
# you can also specify data as a file path to a LibSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
#-------------------save and load models-------------------------
# save model to binary local file
xgb.save(bst, "xgboost.model")
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label=test$label)
#---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them tagged with a name
watchlist <- list(train=dtrain, test=dtest)
# to train with watchlist, use xgb.train, which contains more advanced features
# watchlist allows us to monitor the evaluation result on all data in the list
print ('train xgboost using xgb.train with watchlist')
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print ('train xgboost using xgb.train with watchlist, watch logloss and error')
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
eval.metric = "error", eval.metric = "logloss",
objective = "binary:logistic")
# xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
objective = "binary:logistic")
# information can be extracted from xgb.DMatrix using getinfo
label = getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))
# Finally, you can dump the tree you learned using xgb.dump into a text file
xgb.dump(bst, "dump.raw.txt")


@ -0,0 +1,26 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)
###
# advanced: start from an initial base prediction
#
print('start running example to start from an initial prediction')
# train xgboost for 1 round
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of the transformed prediction in base_margin
# predicting with outputmargin=TRUE will always give you margin values before the logistic transformation
ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
# set the base_margin property of dtrain and dtest
# base margin is the base prediction we will boost from
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)
print('this is result of boost from initial prediction')
bst <- xgb.train( param, dtrain, 1, watchlist )


@ -0,0 +1,47 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
nround <- 2
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5, metrics={'error'})
cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5,
metrics={'error'}, showsd = FALSE)
###
# you can also do cross validation with a customized loss function
# See custom_objective.R
##
print ('running cross validation, with customized loss function')
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max.depth=2,eta=1,silent=1)
# train with customized objective
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)


@ -0,0 +1,39 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max.depth=2,eta=1,silent=1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
# user-defined objective function: given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user-defined evaluation function, returns a pair metric_name, result
# NOTE: when you use a customized loss function, the default prediction value is the margin
# this may make the built-in evaluation metric not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# the built-in evaluation error assumes the input is after the logistic transformation
# keep this in mind when you use the customization; you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
print ('start training with user customized objective')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
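For reference, a short sketch of where logregobj's gradient and hessian come from (standard log-loss calculus):

# For log loss L = -(y*log(p) + (1-y)*log(1-p)) with p = 1/(1+exp(-margin)):
#   dL/dmargin    = p - y         # the 'grad' returned above
#   d2L/dmargin^2 = p * (1 - p)   # the 'hess' returned above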


@ -0,0 +1,34 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of trees for our boosters
# you can fit a linear regression or logistic regression model
##
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
alpha = 0.0001, lambda = 1)
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun);
# parallelization can affect convergence in certain cases
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
##
# the rest of settings are the same
##
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')
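A hedged variant of the param list above, adding the lambda_bias term the comments mention (illustrative, not part of this demo):

# variant sketch: also apply L2 regularization to the bias term
param2 <- list(objective = "binary:logistic", booster = "gblinear",
               alpha = 0.0001, lambda = 1, lambda_bias = 1)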


@ -0,0 +1,23 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
nround = 2
# training the model for two rounds
bst = xgb.train(param, dtrain, nround, watchlist)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest,'label')
### predict using first 1 tree
ypred1 = predict(bst, dtest, ntreelimit=1)
# by default, we predict using all the trees
ypred2 = predict(bst, dtest)
cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')

R-package/demo/runall.R (new file)

@ -0,0 +1,8 @@
# running all scripts in demo folder
demo(basic_walkthrough)
demo(custom_objective)
demo(boost_from_prediction)
demo(predict_first_ntree)
demo(generalized_linear_model)
demo(cross_validation)

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -1,153 +0,0 @@
require(xgboost)
require(methods)
# helper function to read libsvm format this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose adopted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol + 1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i, findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label = label, data = mat))
}
############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
class(dtrain)
# read file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data
# x as Sparse Matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)
# x as dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)
############################ Test xgboost with local file, sparse matrix and dense matrix in R.
# Test with DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")
# Verbose = 0,1,2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 2)
# Test with local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1,nround = 2,
objective = "binary:logistic")
# Test with Sparse Matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")
# Test with dense Matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")
############################ Test predict
# Prediction with DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)
# Prediction with local test file
pred <- predict(bst, "agaricus.txt.test")
# Prediction with Sparse Matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)
# Extrac label with getinfo
labels <- getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
print(paste("error=", err))
############################ Save and load model to hard disk
# save model to binary local file
xgb.save(bst, "xgboost.model")
# load binary model to R
bst <- xgb.load("xgboost.model")
pred <- predict(bst, test.x)
# save model to text file
xgb.dump(bst, "dump.raw.txt")
# save model to text file, with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# load a DMatrix object to R
dtrain <- xgb.DMatrix("dtrain.buffer")
############################ More flexible training function xgb.train
param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)
############################ cutomsized loss function
param <- list(max_depth = 2, eta = 1, silent = 1)
# note: for customized objective function, we leave objective as default note: what we are getting is
# margin value in prediction you must know what you are doing
# user define objective function, given prediction, return gradient and second order gradient this is
# loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a list(metric='metric-name', value='metric-value') NOTE: when
# you do customized loss function, the default prediction value is margin this may make buildin
# evalution metric not function properly for example, we are doing logistic loss, the prediction is
# score before logistic transformation the buildin evaluation error assumes input is after logistic
# transformation Take this in mind when you use the customization, and maybe you need write customized
# evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
# training with customized objective, we can also do step by step training simply look at xgboost.py's
# implementation of train
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)


@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i


@ -0,0 +1,31 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}
\title{Test part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 1611
rows and 127 variables}
\usage{
data(agaricus.test)
}
\description{
This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}


@ -0,0 +1,31 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}
\title{Training part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 6513
rows and 127 variables}
\usage{
data(agaricus.train)
}
\description{
This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}


@ -20,9 +20,12 @@ getinfo(object, ...)
Get information of an xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
labels <- getinfo(dtrain, "label")
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
}


@ -18,15 +18,20 @@ value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear.
set it to be value bigger than 0. It will use all trees by default.}
\item{ntreelimit}{limit the number of trees used in prediction; this parameter is
only valid for gbtree, but not for gblinear. Set it to a value bigger
than 0. All trees are used by default.}
}
\description{
Predicted values based on xgboost model object.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
pred <- predict(bst, test$data)
}

R-package/man/setinfo.Rd (new file)

@ -0,0 +1,33 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{setinfo}
\alias{setinfo}
\alias{setinfo,xgb.DMatrix-method}
\title{Set information of an xgb.DMatrix object}
\usage{
setinfo(object, ...)
\S4method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
}


@ -22,9 +22,9 @@ Get a new DMatrix containing the specified rows of
original xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
dsub <- slice(dtrain, 1:3)
}


@ -19,10 +19,10 @@ indicating the data file.}
Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
}


@ -6,7 +6,7 @@
xgb.DMatrix.save(DMatrix, fname)
}
\arguments{
\item{DMatrix}{the model object.}
\item{DMatrix}{the DMatrix object}
\item{fname}{the name of the binary file.}
}
@ -14,10 +14,10 @@ xgb.DMatrix.save(DMatrix, fname)
Save xgb.DMatrix object to binary file
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
}

R-package/man/xgb.cv.Rd (new file)

@ -0,0 +1,72 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,
metrics = list(), obj = NULL, feval = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also demo/ for a walkthrough example in R.}
\item{data}{takes an \code{xgb.DMatrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{nfold}{number of folds used}
\item{label}{optional field, used when data is a matrix}
\item{showsd}{boolean, whether to show the standard deviation of cross validation}
\item{metrics}{list of evaluation metrics to be used in cross validation;
when it is not specified, the evaluation metric is chosen according to the objective function.
Possible options are:
\itemize{
\item \code{error} binary classification error rate
\item \code{rmse} Root mean square error
\item \code{logloss} negative log-likelihood function
\item \code{auc} Area under curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification
}}
\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}
\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
The cross validation function of xgboost
}
\details{
This is the cross validation function for xgboost
Parallelization is automatically enabled if OpenMP is present.
Number of threads can also be manually specified via "nthread" parameter.
This function only accepts an \code{xgb.DMatrix} object as the input.
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
"max.depth"=3, "eta"=1, "objective"="binary:logistic")
}


@ -13,15 +13,20 @@ xgb.dump(model, fname, fmap = "")
\item{fmap}{feature map file representing the type of feature.
Detailed description could be found at
\url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
Run inst/examples/demo.R for the result and inst/examples/featmap.txt
See demo/ for a walkthrough example in R, and
\url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt}
for an example of the format.}
}
\description{
Save a xgboost model to text file. Could be parsed later.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.dump(bst, 'iris.xgb.model.dump')
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump')
}


@ -12,10 +12,14 @@ xgb.load(modelfile)
Load xgboost model from the binary model file
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}


@ -14,10 +14,14 @@ xgb.save(model, fname)
Save xgboost model from xgboost or xgb.train
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}


@ -3,8 +3,8 @@
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), dtrain, nrounds, watchlist = list(),
obj = NULL, feval = NULL, ...)
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
feval = NULL, verbose = 1, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@ -15,14 +15,14 @@ xgb.train(params = list(), dtrain, nrounds, watchlist = list(),
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for walkthrough example in R.}
further details. See also demo/ for a walkthrough example in R.}
\item{dtrain}{takes an \code{xgb.DMatrix} as the input.}
\item{data}{takes an \code{xgb.DMatrix} as the input.}
\item{nrounds}{the max number of iterations}
@ -39,6 +39,9 @@ gradient with given prediction and dtrain,}
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain,}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both
performance and construction progress.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
@ -55,12 +58,11 @@ It supports advanced features such as watchlist, customized objective function,
therefore it is more flexible than \code{\link{xgboost}}.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, silent = 1)
param <- list(max.depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))

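The R chunk above is truncated by the diff; the same logistic objective appears in full in the Python demo added by this commit (demo/guide-python/cross_validation.py). A self-contained sketch of the pair of callbacks:

```python
import numpy as np

def logregobj(preds, dtrain):
    # customized objective: return first and second order gradients
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid of the raw margin
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess

def evalerror(preds, dtrain):
    # customized metric: classification error computed on raw margins
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
```

Both are then passed as the obj and feval arguments of xgb.train or xgb.cv.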
View File

@ -20,12 +20,12 @@ xgboost(data = NULL, label = NULL, params = list(), nrounds,
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for walkthrough example in R.}
further details. See also demo/ for a walkthrough example in R.}
\item{nrounds}{the max number of iterations}
@ -45,8 +45,12 @@ Parallelization is automatically enabled if OpenMP is present.
Number of threads can also be manually specified via "nthread" parameter
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
pred <- predict(bst, test$data)
}

View File

@ -6,10 +6,9 @@
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
#include "src/utils/matrix_csr.h"
#include "xgboost_R.h"
using namespace std;
using namespace xgboost;
extern "C" {
@ -92,37 +91,25 @@ extern "C" {
SEXP indices,
SEXP data) {
_WrapperBegin();
const int *col_ptr = INTEGER(indptr);
const int *row_index = INTEGER(indices);
const double *col_data = REAL(data);
int ncol = length(indptr) - 1;
const int *p_indptr = INTEGER(indptr);
const int *p_indices = INTEGER(indices);
const double *p_data = REAL(data);
int nindptr = length(indptr);
int ndata = length(data);
// transform into CSR format
std::vector<bst_ulong> row_ptr;
std::vector< std::pair<unsigned, float> > csr_data;
utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
builder.InitBudget();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(row_index[j]);
}
std::vector<bst_ulong> col_ptr_(nindptr);
std::vector<unsigned> indices_(ndata);
std::vector<float> data_(ndata);
for (int i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
}
builder.InitStorage();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
}
}
utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
std::vector<float> row_data(ndata);
std::vector<unsigned> col_index(ndata);
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
col_index[i] = csr_data[i].first;
row_data[i] = csr_data[i].second;
indices_[i] = static_cast<unsigned>(p_indices[i]);
data_[i] = static_cast<float>(p_data[i]);
}
void *handle = XGDMatrixCreateFromCSR(BeginPtr(row_ptr), BeginPtr(col_index),
BeginPtr(row_data), row_ptr.size(), ndata );
void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata);
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
@ -188,6 +175,10 @@ extern "C" {
_WrapperEnd();
return ret;
}
SEXP XGDMatrixNumRow_R(SEXP handle) {
bst_ulong nrow = XGDMatrixNumRow(R_ExternalPtrAddr(handle));
return ScalarInteger(static_cast<int>(nrow));
}
// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;

View File

@ -65,6 +65,11 @@ extern "C" {
* \return info vector
*/
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
* \brief return number of rows
* \param handle an instance of data matrix
*/
SEXP XGDMatrixNumRow_R(SEXP handle);
/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached

View File

@ -73,18 +73,22 @@ and ranking. The package is made to be extendible, so that users are also allowe
\end{enumerate}
\section{Example with iris}
\section{Example with Mushroom data}
In this section, we will illustrate some common usage of \verb@xgboost@.
In this section, we will illustrate some common usage of \verb@xgboost@. The
Mushroom data is cited from the UCI Machine Learning Repository \citep{Bache+Lichman:2013}.
<<Training and prediction with Mushroom data>>=
library(xgboost)
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
nrounds = 5)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
nround = 2, objective = "binary:logistic")
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, as.matrix(iris[,1:4]))
pred <- predict(bst, test$data)
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
@ -101,17 +105,19 @@ The output looks like
\begin{verbatim}
booster[0]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.147059
2:[f3<1.65] yes=3,no=4,missing=3
3:leaf=0.464151
4:leaf=0.722449
0:[f28<1.00001] yes=1,no=2,missing=2
1:[f108<1.00001] yes=3,no=4,missing=4
3:leaf=1.85965
4:leaf=-1.94071
2:[f55<1.00001] yes=5,no=6,missing=6
5:leaf=-1.70044
6:leaf=1.71218
booster[1]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.103806
2:[f2<4.85] yes=3,no=4,missing=3
3:leaf=0.316341
4:leaf=0.510365
0:[f59<1.00001] yes=1,no=2,missing=2
1:leaf=-6.23624
2:[f28<1.00001] yes=3,no=4,missing=4
3:leaf=-0.96853
4:leaf=0.784718
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
@ -120,18 +126,16 @@ training from initial prediction value, weighted training instance.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
iris.mat <- as.matrix(iris[,1:4])
iris.label <- as.numeric(iris[,5])
diris <- xgb.DMatrix(iris.mat, label = iris.label)
class(diris)
getinfo(diris,'label')
dtrain <- xgb.DMatrix(train$data, label = train$label)
class(dtrain)
head(getinfo(dtrain,'label'))
@
We can also save the matrix to a binary file and then simply load it back with
\verb@xgb.DMatrix@:
<<save model>>=
xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
diris = xgb.DMatrix('iris.xgb.DMatrix')
xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
dtrain = xgb.DMatrix('xgb.DMatrix')
@
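The Python wrapper in this same commit supports the analogous round trip; a minimal sketch, assuming the wrapper's \verb@save_binary@ method and the demo's relative data path:

```python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb

# build a DMatrix from the demo data, save it as a binary buffer,
# then load it back through the DMatrix constructor
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtrain.save_binary('dtrain.buffer')   # save_binary is assumed here
dtrain2 = xgb.DMatrix('dtrain.buffer')
```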
\section{Advanced Examples}
@ -156,11 +160,11 @@ evalerror <- function(preds, dtrain) {
return(list(metric = "MSE", value = err))
}
dtest <- slice(diris,1:100)
watchlist <- list(eval = dtest, train = diris)
param <- list(max_depth = 2, eta = 1, silent = 1)
dtest <- xgb.DMatrix(test$data, label = test$label)
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max.depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
@
The gradient and second order gradient are required as the output of a customized
@ -169,7 +173,7 @@ objective function.
We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@ for further
For a walkthrough demo, please see \verb@R-package/demo/@ for further
details.
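As a concrete instance, for the logistic objective used above, write $p = 1/(1+e^{-\hat{y}})$ for the transformed prediction and $y$ for the label; the two quantities \verb@logregobj@ returns are
\[
\frac{\partial \ell}{\partial \hat{y}} = p - y, \qquad
\frac{\partial^2 \ell}{\partial \hat{y}^2} = p\,(1 - p),
\]
the gradient and second order gradient of the negative log-likelihood $\ell$ with respect to the raw prediction $\hat{y}$.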
\section{The Higgs Boson competition}

View File

@ -18,3 +18,13 @@
publisher={Institute of Mathematical Statistics}
}
@misc{Bache+Lichman:2013,
author = "K. Bache and M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences"
}

View File

@ -1,6 +1,6 @@
xgboost: eXtreme Gradient Boosting
======
An optimized general purpose gradient boosting library. It implements machine learning algorithm under gradient boosting framework, including an efficient linear model solver and gradient boosted regression tree.
An optimized general purpose gradient boosting library. The library is parallelized using OpenMP. It implements machine learning algorithms under the gradient boosting framework, including a generalized linear model and gradient boosted regression trees.
Contributors: https://github.com/tqchen/xgboost/graphs/contributors
@ -8,10 +8,15 @@ Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
Questions and Issues: [https://github.com/tqchen/xgboost/issues](https://github.com/tqchen/xgboost/issues?q=is%3Aissue+label%3Aquestion)
Examples Code: [demo folder](demo)
Examples Code: [Learning to use xgboost by examples](demo)
Notes on the Code: [Code Guide](src)
What's New
=====
* See the updated [demo folder](demo) for feature walkthrough
* Thanks to Tong He, the new [R package](R-package) is available
Features
======
* Sparse feature format:
@ -24,8 +29,8 @@ Features
* Python interface, works with numpy and scipy.sparse matrix
Build
======
* Simply type make
=====
* Run ```bash build.sh``` (you can also type make)
* If your compiler does not come with OpenMP support, it will fire a warning telling you that the code will compile in single thread mode, and you will get a single thread xgboost
* You may get an error: -lgomp is not found
- You can type ```make no_omp=1```, which will get you a single thread xgboost

build.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/bash
# this is a simple script to make xgboost on Mac and Linux
# basically, it first tries to make with OpenMP; if that fails, it disables OpenMP and makes again
# This automatically makes xgboost for Mac users who do not have OpenMP support
# In most cases, typing make will give you what you want
if make; then
echo "Successfully build multi-thread xgboost"
else
echo "-----------------------------"
echo "Building multi-thread xgboost failed"
echo "Start to build single-thread xgboost"
make clean
make no_omp=1
echo "Successfully build single-thread xgboost"
fi

View File

@ -1,17 +1,19 @@
XGBoost Examples
====
This folder contains all the example code using xgboost.
Contribution of exampls, benchmarks is more than welcomed!
If you like to share how you use xgboost to solve your problem, send a pull request:)
* Contributions of examples and benchmarks are more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
Features Walkthrough
====
This is a list of short examples introducing different functionalities of xgboost and its wrappers.
* Basic walkthrough of wrappers. [python](guide-python/basic_walkthrough.py)
* Cutomize loss function, and evaluation metric. [python](guide-python/custom_objective.py)
* Boosting from existing prediction. [python](guide-python/boost_from_prediction.py)
* Predicting using first n trees. [python](guide-python/predict_first_ntree.py)
* Cross validation(to come)
* Basic walkthrough of wrappers [python](guide-python/basic_walkthrough.py)
* Customize loss function and evaluation metric [python](guide-python/custom_objective.py)
* Boosting from existing prediction [python](guide-python/boost_from_prediction.py)
* Predicting using first n trees [python](guide-python/predict_first_ntree.py)
* Generalized Linear Model [python](guide-python/generalized_linear_model.py)
* Cross validation [python](guide-python/cross_validation.py)
Basic Examples by Tasks
====

View File

@ -1,3 +0,0 @@
XGBoost R Feature Walkthrough
====
To be finished

View File

@ -1,5 +0,0 @@
#!/bin/bash
# todo
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R

View File

@ -4,3 +4,5 @@ XGBoost Python Feature Walkthrough
* [Customize loss function and evaluation metric](custom_objective.py)
* [Boosting from existing prediction](boost_from_prediction.py)
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)

View File

@ -42,7 +42,7 @@ assert np.sum(np.abs(preds2-preds)) == 0
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse')
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
labels = []
row = []; col = []; dat = []
i = 0
@ -54,16 +54,22 @@ for l in open('../data/agaricus.txt.train'):
row.append(i); col.append(int(k)); dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
dtrain.set_label(labels)
dtrain = xgb.DMatrix( csr, label = labels )
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )
print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
# we can also construct from csc matrix
csc = scipy.sparse.csc_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )
print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
# NOTE: npymat is a numpy array; we will convert it into scipy.sparse.csr_matrix in the internal implementation
# then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix( npymat)
dtrain.set_label(labels)
dtrain = xgb.DMatrix(npymat, label = labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )

View File

@ -0,0 +1,63 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0)
print ('running cross validation, disable standard deviation display')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value
# the standard deviation is suppressed by show_stdv = False
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0, show_stdv = False)
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
label = dtrain.get_label()
ratio = float(np.sum(label == 0)) / np.sum(label==1)
param['scale_pos_weight'] = ratio
return (dtrain, dtest, param)
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed = 0, fpreproc = fpreproc)
###
# you can also do cross validation with customized loss function
# See custom_objective.py
##
print ('running cross validation, with customized loss function')
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
def evalerror(preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
obj = logregobj, feval=evalerror)

View File

@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of trees as our booster
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1 }
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# parallelization could affect convergence in certain cases
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 1
##
# the rest of settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))

View File

@ -2,4 +2,6 @@
python basic_walkthrough.py
python custom_objective.py
python boost_from_prediction.py
rm *~ *.model *.buffer
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer

demo/kaggle-higgs/higgs-cv.py Executable file
View File

@ -0,0 +1,39 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
label = train[:,32]
data = train[:,1:31]
weight = train[:,31]
dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4}
num_round = 120
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
label = dtrain.get_label()
ratio = float(np.sum(label == 0)) / np.sum(label==1)
param['scale_pos_weight'] = ratio
wtrain = dtrain.get_weight()
wtest = dtest.get_weight()
sum_weight = sum(wtrain) + sum(wtest)
wtrain *= sum_weight / sum(wtrain)
wtest *= sum_weight / sum(wtest)
dtrain.set_weight(wtrain)
dtest.set_weight(wtest)
return (dtrain, dtest, param)
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'ams@0.15', 'auc'}, seed = 0, fpreproc = fpreproc)

View File

@ -80,6 +80,9 @@ class EvalSet{
}
return result;
}
inline size_t Size(void) const {
return evals_.size();
}
private:
std::vector<const IEvaluator*> evals_;

View File

@ -244,7 +244,9 @@ class BoostLearner {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
}
evaluator_.AddEval(obj_->DefaultEvalMetric());
if (evaluator_.Size() == 0) {
evaluator_.AddEval(obj_->DefaultEvalMetric());
}
}
/*!
* \brief get un-transformed prediction

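The guard above changes the reporting behavior: a user-configured eval_metric now replaces the objective's default metric instead of being appended to it. A hypothetical Python configuration illustrating the effect:

```python
# only 'auc' will be reported during training; the objective's
# default metric is added only when no eval_metric is configured
param = {'max_depth': 2, 'eta': 1, 'silent': 1,
         'objective': 'binary:logistic', 'eval_metric': 'auc'}
```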
View File

@ -30,17 +30,17 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>

View File

@ -1,5 +1,8 @@
# Author: Tianqi Chen, Bing Xu
# module for xgboost
"""
xgboost: eXtreme Gradient Boosting library
Author: Tianqi Chen, Bing Xu
"""
import ctypes
import os
# optionally have scipy sparse, though not necessary
@ -7,7 +10,6 @@ import numpy as np
import sys
import numpy.ctypeslib
import scipy.sparse as scp
import random
# set this line correctly
if os.name == 'nt':
@ -17,15 +19,16 @@ else:
# load in xgboost library
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
# DMatrix functions
xglib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromCSC.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p
xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p
xglib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGDMatrixGetUIntInfo.restype = ctypes.POINTER(ctypes.c_uint)
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
# booster functions
xglib.XGBoosterCreate.restype = ctypes.c_void_p
xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
@ -64,6 +67,8 @@ class DMatrix:
xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 0))
elif isinstance(data, scp.csr_matrix):
self.__init_from_csr(data)
elif isinstance(data, scp.csc_matrix):
self.__init_from_csc(data)
elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
self.__init_from_npy2d(data, missing)
else:
@ -86,6 +91,15 @@ class DMatrix:
(ctypes.c_float * len(csr.data))(*csr.data),
len(csr.indptr), len(csr.data)))
def __init_from_csc(self, csc):
"""convert data from csr matrix"""
assert len(csc.indices) == len(csc.data)
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC(
(ctypes.c_ulong * len(csc.indptr))(*csc.indptr),
(ctypes.c_uint * len(csc.indices))(*csc.indices),
(ctypes.c_float * len(csc.data))(*csc.data),
len(csc.indptr), len(csc.data)))
def __init_from_npy2d(self,mat,missing):
"""convert data from numpy matrix"""
data = numpy.array(mat.reshape(mat.size), dtype='float32')
@ -362,6 +376,7 @@ class Booster:
evals: list of tuple (DMatrix, string)
lists of items to be evaluated
it: int
current iteration
feval: function
custom evaluation function
Returns:
@ -391,7 +406,8 @@ class Booster:
the dmatrix storing the input
output_margin: bool
whether output raw margin value that is untransformed
ntree_limit: limit number of trees in prediction, default to 0, 0 means using all the trees
ntree_limit: int
limit the number of trees used in prediction; the default of 0 means use all the trees
Returns:
numpy array of prediction
"""
@ -471,14 +487,15 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None
params of booster
dtrain: DMatrix
data to be trained
num_boost_round: int
num_boost_round: int
num of round to be boosted
evals: list
list of items to be evaluated
watchlist: list of pairs (DMatrix, string)
list of items to be evaluated during training, this allows user to watch performance on validation set
obj: function
customized objective function
feval: function
customized evaluation function
Returns: Booster model trained
"""
bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
for i in range(num_boost_round):
@ -513,11 +530,13 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None):
# run preprocessing on the data set if needed
if fpreproc is not None:
dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
else:
tparam = param
plst = tparam.items() + [('eval_metric', itm) for itm in evals]
ret.append(CVPack(dtrain, dtest, plst))
return ret
def aggcv(rlist):
def aggcv(rlist, show_stdv=True):
"""
aggregate cross validation results
"""
@ -533,11 +552,14 @@ def aggcv(rlist):
cvmap[k].append(float(v))
for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
v = np.array(v)
ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
if show_stdv:
ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
else:
ret += '\tcv-%s:%f' % (k, np.mean(v))
return ret
def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
obj = None, feval = None, fpreproc = None):
def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
obj = None, feval = None, fpreproc = None, show_stdv = True, seed = 0):
""" cross validation with given paramaters
Args:
params: dict
@ -547,17 +569,29 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
num_boost_round: int
num of round to be boosted
nfold: int
folds to do cv
evals: list or
list of items to be evaluated
obj:
feval:
fpreproc: preprocessing function that takes dtrain, dtest,
number of folds to do cv
metrics: list of strings
evaluation metrics to be watched in cv
obj: function
custom objective function
feval: function
custom evaluation function
fpreproc: function
preprocessing function that takes dtrain, dtest,
param and return transformed version of dtrain, dtest, param
show_stdv: bool
whether to display the standard deviation
seed: int
seed used to generate the folds, this is passed to numpy.random.seed
Returns: list(string) of evaluation history
"""
cvfolds = mknfold(dtrain, nfold, params, 0, eval_metric, fpreproc)
results = []
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
for i in range(num_boost_round):
for f in cvfolds:
f.update(i, obj)
res = aggcv([f.eval(i, feval) for f in cvfolds])
res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
sys.stderr.write(res+'\n')
results.append(res)
return results

View File

@ -14,6 +14,7 @@ using namespace std;
#include "../src/learner/learner-inl.hpp"
#include "../src/io/io.h"
#include "../src/utils/utils.h"
#include "../src/utils/matrix_csr.h"
#include "../src/io/simple_dmatrix-inl.hpp"
using namespace xgboost;
@ -102,6 +103,31 @@ extern "C"{
mat.info.info.num_row = nindptr - 1;
return p_mat;
}
XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem) {
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
utils::SparseCSRMBuilder<RowBatch::Entry, false> builder(mat.row_ptr_, mat.row_data_);
builder.InitBudget();
bst_ulong ncol = nindptr - 1;
for (bst_ulong i = 0; i < ncol; ++i) {
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(indices[j]);
}
}
builder.InitStorage();
for (bst_ulong i = 0; i < ncol; ++i) {
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.PushElem(indices[j], RowBatch::Entry(static_cast<bst_uint>(i), data[j]));
}
}
mat.info.info.num_row = mat.row_ptr_.size() - 1;
mat.info.info.num_col = static_cast<size_t>(ncol);
return p_mat;
}
void* XGDMatrixCreateFromMat(const float *data,
bst_ulong nrow,
bst_ulong ncol,

View File

@ -22,7 +22,7 @@ extern "C" {
* \return a loaded data matrix
*/
XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent);
/*!
/*!
* \brief create a matrix content from csr format
* \param indptr pointer to row headers
* \param indices findex
@ -36,6 +36,20 @@ extern "C" {
const float *data,
bst_ulong nindptr,
bst_ulong nelem);
/*!
* \brief create a matrix content from CSC format
* \param col_ptr pointer to col headers
* \param indices findex
* \param data fvalue
* \param nindptr number of columns in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \return created dmatrix
*/
XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem);
/*!
* \brief create matrix content from dense matrix
* \param data pointer to the data space