[R] Redesigned xgboost() interface skeleton (#10456)

Co-authored-by: Michael Mayer <mayermichael79@gmail.com>

parent 17c64300e3
commit ab982e7873
@@ -57,7 +57,8 @@ Suggests:
     igraph (>= 1.0.1),
     float,
     titanic,
-    RhpcBLASctl
+    RhpcBLASctl,
+    survival
 Depends:
     R (>= 4.3.0)
 Imports:
@@ -13,6 +13,7 @@ S3method(predict,xgb.Booster)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
 S3method(print,xgb.cv.synchronous)
+S3method(print,xgboost)
 S3method(setinfo,xgb.Booster)
 S3method(setinfo,xgb.DMatrix)
 S3method(variable.names,xgb.Booster)
@@ -30,6 +30,40 @@ NVL <- function(x, val) {
   return(c('rank:pairwise', 'rank:ndcg', 'rank:map'))
 }
+
+.OBJECTIVES_NON_DEFAULT_MODE <- function() {
+  return(c("reg:logistic", "binary:logitraw", "multi:softmax"))
+}
+
+.BINARY_CLASSIF_OBJECTIVES <- function() {
+  return(c("binary:logistic", "binary:hinge"))
+}
+
+.MULTICLASS_CLASSIF_OBJECTIVES <- function() {
+  return("multi:softprob")
+}
+
+.SURVIVAL_RIGHT_CENSORING_OBJECTIVES <- function() { # nolint
+  return(c("survival:cox", "survival:aft"))
+}
+
+.SURVIVAL_ALL_CENSORING_OBJECTIVES <- function() { # nolint
+  return("survival:aft")
+}
+
+.REGRESSION_OBJECTIVES <- function() {
+  return(c(
+    "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror",
+    "reg:absoluteerror", "reg:quantileerror", "count:poisson", "reg:gamma", "reg:tweedie"
+  ))
+}
+
+.MULTI_TARGET_OBJECTIVES <- function() {
+  return(c(
+    "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror",
+    "reg:quantileerror", "reg:gamma"
+  ))
+}


 #
 # Low-level functions for boosting --------------------------------------------
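These groups encode which objectives are compatible with which kinds of response, and the redesigned xgboost() interface (documented further below) uses that information to validate a user-supplied objective against the type of y. A minimal sketch of such a check — the helper name here is hypothetical, not part of this commit:

# Illustrative only: validate an objective against the type of 'y' using the
# objective groups defined above (this exact helper is not part of the PR).
check.objective.matches.y <- function(objective, y) {
  classif <- c(.BINARY_CLASSIF_OBJECTIVES(), .MULTICLASS_CLASSIF_OBJECTIVES())
  if (is.factor(y) && !(objective %in% classif)) {
    stop("Factor 'y' can only be used with classification objectives.")
  }
  if (!is.factor(y) && objective %in% classif) {
    stop(sprintf("Objective '%s' requires a factor 'y'.", objective))
  }
  invisible(TRUE)
}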
@@ -663,9 +663,8 @@ validate.features <- function(bst, newdata) {
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -767,9 +766,8 @@ xgb.attributes <- function(object) {
 #' data.table::setDTthreads(nthread)
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = nthread,
@@ -817,9 +815,8 @@ xgb.config <- function(object) {
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -1230,9 +1227,8 @@ xgb.is.same.Booster <- function(obj1, obj2) {
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -853,36 +853,6 @@ xgb.DMatrix.hasinfo <- function(object, info) {
 }
-
-
-# get dmatrix from data, label
-# internal helper method
-xgb.get.DMatrix <- function(data, label, missing, weight, nthread) {
-  if (inherits(data, "dgCMatrix") || is.matrix(data)) {
-    if (is.null(label)) {
-      stop("label must be provided when data is a matrix")
-    }
-    dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread)
-    if (!is.null(weight)) {
-      setinfo(dtrain, "weight", weight)
-    }
-  } else {
-    if (!is.null(label)) {
-      warning("xgboost: label will be ignored.")
-    }
-    if (is.character(data)) {
-      data <- path.expand(data)
-      dtrain <- xgb.DMatrix(data[1])
-    } else if (inherits(data, "xgb.DMatrix")) {
-      dtrain <- data
-    } else if (inherits(data, "data.frame")) {
-      stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
-    } else {
-      stop("xgboost: invalid input data")
-    }
-  }
-  return(dtrain)
-}


 #' Dimensions of xgb.DMatrix
 #'
 #' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
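With this internal helper removed, the matrix/label conversion is no longer done implicitly for callers. The migration implied by the example changes throughout this commit is, roughly:

# Old behaviour (handled inside the removed helper):
#   bst <- xgboost(data = train$data, label = train$label, ...)
# New equivalent: construct the xgb.DMatrix explicitly and call xgb.train().
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
bst <- xgb.train(params = list(max_depth = 2, eta = 1, objective = "binary:logistic"),
                 data = dtrain, nrounds = 2)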
@@ -29,8 +29,8 @@
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
 #'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' # save the model in file 'xgb.model.dump'
 #' dump_path = file.path(tempdir(), 'model.dump')
 #' xgb.dump(bst, dump_path, with_stats = TRUE)
@@ -46,9 +46,8 @@
 #' # binomial classification using "gbtree":
 #' data(agaricus.train, package = "xgboost")
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -59,9 +58,8 @@
 #' xgb.importance(model = bst)
 #'
 #' # binomial classification using "gblinear":
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   booster = "gblinear",
 #'   eta = 0.3,
 #'   nthread = 1,
@@ -73,9 +71,11 @@
 #' # multiclass classification using "gbtree":
 #' nclass <- 3
 #' nrounds <- 10
-#' mbst <- xgboost(
-#'   data = as.matrix(iris[, -5]),
-#'   label = as.numeric(iris$Species) - 1,
+#' mbst <- xgb.train(
+#'   data = xgb.DMatrix(
+#'     as.matrix(iris[, -5]),
+#'     label = as.numeric(iris$Species) - 1
+#'   ),
 #'   max_depth = 3,
 #'   eta = 0.2,
 #'   nthread = 2,
@@ -99,9 +99,11 @@
 #' )
 #'
 #' # multiclass classification using "gblinear":
-#' mbst <- xgboost(
-#'   data = scale(as.matrix(iris[, -5])),
-#'   label = as.numeric(iris$Species) - 1,
+#' mbst <- xgb.train(
+#'   data = xgb.DMatrix(
+#'     scale(as.matrix(iris[, -5])),
+#'     label = as.numeric(iris$Species) - 1
+#'   ),
 #'   booster = "gblinear",
 #'   eta = 0.2,
 #'   nthread = 1,
@@ -43,9 +43,8 @@
 #' nthread <- 1
 #' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = nthread,
@@ -48,9 +48,8 @@
 #' data.table::setDTthreads(nthread)
 #'
 #' ## Change max_depth to a higher number to get a more significant result
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 6,
 #'   nthread = nthread,
 #'   nrounds = 50,
@@ -51,9 +51,8 @@
 #' nthread <- 2
 #' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 3,
 #'   eta = 1,
 #'   nthread = nthread,
@@ -35,9 +35,8 @@
 #' nthread <- 2
 #' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 15,
 #'   eta = 1,
 #'   nthread = nthread,
@@ -82,9 +82,8 @@
 #' data.table::setDTthreads(nthread)
 #' nrounds <- 20
 #'
-#' bst <- xgboost(
-#'   agaricus.train$data,
-#'   agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
 #'   nrounds = nrounds,
 #'   eta = 0.1,
 #'   max_depth = 3,
@@ -108,9 +107,8 @@
 #' set.seed(123)
 #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
 #'
-#' mbst <- xgboost(
-#'   data = x,
-#'   label = as.numeric(iris$Species) - 1,
+#' mbst <- xgb.train(
+#'   data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
 #'   nrounds = nrounds,
 #'   max_depth = 2,
 #'   eta = 0.3,
@@ -68,9 +68,8 @@
 #' @examples
 #' data(agaricus.train, package = "xgboost")
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
 #'   max_depth = 3,
 #'   eta = 1,
 #'   nthread = 2,
@@ -182,12 +182,6 @@
 #' as R attributes, and thus do not get saved when using XGBoost's own serializaters like
 #' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
 #' @param ... other parameters to pass to \code{params}.
-#' @param label vector of response values. Should not be provided when data is
-#'        a local data file name or an \code{xgb.DMatrix}.
-#' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
-#'        by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-#'        This parameter is only used when input is a dense matrix.
-#' @param weight a vector indicating the weight for each row of the input.
 #'
 #' @return
 #' An object of class \code{xgb.Booster}.
@@ -328,12 +322,10 @@
 #'                  early_stopping_rounds = 3)
 #'
 #' ## An 'xgboost' interface example:
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-#'                max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-#'                objective = "binary:logistic")
+#' bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+#'                params = list(max_depth = 2, eta = 1), nthread = nthread, nrounds = 2)
 #' pred <- predict(bst, agaricus.test$data)
 #'
-#' @rdname xgb.train
 #' @export
 xgb.train <- function(params = list(), data, nrounds, evals = list(),
                       obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
(File diff suppressed because it is too large.)
@@ -16,29 +16,28 @@ class(train$data)
 # note: we are putting in sparse matrix here, xgboost naturally handles sparse input
 # use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
 print("Training xgboost with sparseMatrix")
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic")
+bst <- xgboost(x = train$data, y = factor(train$label, c(0, 1)),
+               params = list(max_depth = 2, eta = 1),
+               nrounds = 2, nthread = 2)
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
 print("Training xgboost with Matrix")
-bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic")
+bst <- xgboost(x = as.matrix(train$data), y = factor(train$label, c(0, 1)),
+               params = list(max_depth = 2, eta = 1),
+               nrounds = 2, nthread = 2)

 # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
 print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
-               objective = "binary:logistic")
+params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic")
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2)

 # Verbose = 0,1,2
 print("Train xgboost with verbose 0, no message")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 0)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 0)
 print("Train xgboost with verbose 1, print evaluation metric")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 1)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 1)
 print("Train xgboost with verbose 2, also print information about tree")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 2)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 2)

 # you can also specify data as file path to a LIBSVM format input
 # since we do not have this file with us, the following line is just for illustration
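Note that the demo now wraps the 0/1 label in factor(train$label, c(0, 1)): the redesigned interface infers the task from the type of y, so a bare numeric vector would be fit as regression. A two-line illustration:

y_raw <- c(0, 1, 0, 1)           # numeric vector -> regression (reg:squarederror)
y_cls <- factor(y_raw, c(0, 1))  # two-level factor -> binary:logistic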
@@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,
@@ -64,9 +64,8 @@ example of these behaviors).
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,
@@ -35,9 +35,8 @@ nthread <- 1
 data.table::setDTthreads(nthread)
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = nthread,
@@ -49,8 +49,8 @@ data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
 dump_path = file.path(tempdir(), 'model.dump')
 xgb.dump(bst, dump_path, with_stats = TRUE)
@@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati
 # binomial classification using "gbtree":
 data(agaricus.train, package = "xgboost")

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,
@@ -83,9 +82,8 @@ bst <- xgboost(
 xgb.importance(model = bst)

 # binomial classification using "gblinear":
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   booster = "gblinear",
   eta = 0.3,
   nthread = 1,
|
|||||||
# multiclass classification using "gbtree":
|
# multiclass classification using "gbtree":
|
||||||
nclass <- 3
|
nclass <- 3
|
||||||
nrounds <- 10
|
nrounds <- 10
|
||||||
mbst <- xgboost(
|
mbst <- xgb.train(
|
||||||
data = as.matrix(iris[, -5]),
|
data = xgb.DMatrix(
|
||||||
label = as.numeric(iris$Species) - 1,
|
as.matrix(iris[, -5]),
|
||||||
|
label = as.numeric(iris$Species) - 1
|
||||||
|
),
|
||||||
max_depth = 3,
|
max_depth = 3,
|
||||||
eta = 0.2,
|
eta = 0.2,
|
||||||
nthread = 2,
|
nthread = 2,
|
||||||
@@ -123,9 +123,11 @@ xgb.importance(
 )

 # multiclass classification using "gblinear":
-mbst <- xgboost(
-  data = scale(as.matrix(iris[, -5])),
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(
+    scale(as.matrix(iris[, -5])),
+    label = as.numeric(iris$Species) - 1
+  ),
   booster = "gblinear",
   eta = 0.2,
   nthread = 1,
@@ -63,9 +63,8 @@ data(agaricus.train, package = "xgboost")
 nthread <- 1
 data.table::setDTthreads(nthread)

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 2,
   eta = 1,
   nthread = nthread,
@@ -33,9 +33,8 @@ will reset its number of rounds indicator to zero.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,
@@ -73,9 +73,8 @@ nthread <- 2
 data.table::setDTthreads(nthread)

 ## Change max_depth to a higher number to get a more significant result
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 6,
   nthread = nthread,
   nrounds = 50,
@@ -88,9 +88,8 @@ data(agaricus.train)
 nthread <- 2
 data.table::setDTthreads(nthread)

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 3,
   eta = 1,
   nthread = nthread,
@@ -67,9 +67,8 @@ data(agaricus.train, package = "xgboost")
 nthread <- 2
 data.table::setDTthreads(nthread)

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 15,
   eta = 1,
   nthread = nthread,
@@ -135,9 +135,8 @@ nthread <- 1
 data.table::setDTthreads(nthread)
 nrounds <- 20

-bst <- xgboost(
-  agaricus.train$data,
-  agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
   nrounds = nrounds,
   eta = 0.1,
   max_depth = 3,
@@ -161,9 +160,8 @@ x <- as.matrix(iris[, -5])
 set.seed(123)
 is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values

-mbst <- xgboost(
-  data = x,
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
   nrounds = nrounds,
   max_depth = 2,
   eta = 0.3,
@@ -96,9 +96,8 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR back
 \examples{
 data(agaricus.train, package = "xgboost")

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
   max_depth = 3,
   eta = 1,
   nthread = 2,
@@ -1,8 +1,7 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/xgb.train.R, R/xgboost.R
+% Please edit documentation in R/xgb.train.R
 \name{xgb.train}
 \alias{xgb.train}
-\alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(
@@ -22,24 +21,6 @@ xgb.train(
   callbacks = list(),
   ...
 )
-
-xgboost(
-  data = NULL,
-  label = NULL,
-  missing = NA,
-  weight = NULL,
-  params = list(),
-  nrounds,
-  verbose = 1,
-  print_every_n = 1L,
-  early_stopping_rounds = NULL,
-  maximize = NULL,
-  save_period = NULL,
-  save_name = "xgboost.model",
-  xgb_model = NULL,
-  callbacks = list(),
-  ...
-)
 }
 \arguments{
 \item{params}{the list of parameters. The complete list of parameters is
@@ -240,15 +221,6 @@ to customize the training process.
 }\if{html}{\out{</div>}}}

 \item{...}{other parameters to pass to \code{params}.}
-
-\item{label}{vector of response values. Should not be provided when data is
-       a local data file name or an \code{xgb.DMatrix}.}
-
-\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing'
-       by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-       This parameter is only used when input is a dense matrix.}
-
-\item{weight}{a vector indicating the weight for each row of the input.}
 }
 \value{
 An object of class \code{xgb.Booster}.
@@ -383,9 +355,8 @@ bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
                  early_stopping_rounds = 3)

 ## An 'xgboost' interface example:
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-               max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-               objective = "binary:logistic")
+bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+               params = list(max_depth = 2, eta = 1), nthread = nthread, nrounds = 2)
 pred <- predict(bst, agaricus.test$data)

 }
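Taken together, the old-to-new mapping for the high-level call is: data/label become x/y (classification labels passed as a factor), and booster parameters move into a params list. A sketch, with X and y01 as placeholder feature matrix and 0/1 labels:

# Old (removed) signature:
#   xgboost(data = X, label = y01, max_depth = 2, eta = 1, nrounds = 2,
#           objective = "binary:logistic")
# New signature, as used in the examples in this diff:
bst <- xgboost(x = X, y = factor(y01), params = list(max_depth = 2, eta = 1), nrounds = 2)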
R-package/man/xgboost.Rd (new file, 213 lines)
@@ -0,0 +1,213 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost}
\alias{xgboost}
\title{Fit XGBoost Model}
\usage{
xgboost(
  x,
  y,
  objective = NULL,
  nrounds = 100L,
  weights = NULL,
  verbosity = 0L,
  nthreads = parallel::detectCores(),
  seed = 0L,
  monotone_constraints = NULL,
  interaction_constraints = NULL,
  feature_weights = NULL,
  base_margin = NULL,
  ...
)
}
\arguments{
\item{x}{The features / covariates. Can be passed as:\itemize{
\item A numeric or integer `matrix`.
\item A `data.frame`, in which all columns are one of the following types:\itemize{
\item `numeric`
\item `integer`
\item `logical`
\item `factor`
}

Columns of `factor` type will be assumed to be categorical, while other column types will
be assumed to be numeric.
\item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class.
}

Note that categorical features are only supported for `data.frame` inputs, and are automatically
determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible
variants that would allow something like categorical features on sparse matrices.}

\item{y}{The response variable. Allowed values are:\itemize{
\item A numeric or integer vector (for regression tasks).
\item A factor or character vector (for binary and multi-class classification tasks).
\item A logical (boolean) vector (for binary classification tasks).
\item A numeric or integer matrix or `data.frame` with numeric/integer columns
(for multi-task regression tasks).
\item A `Surv` object from the `survival` package (for survival tasks).
}

If `objective` is `NULL`, the right task will be determined automatically based on
the class of `y`.

If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
can only be used with classification objectives and vice-versa.

For binary classification, the last factor level of `y` will be used as the "positive"
class - that is, the numbers from `predict` will reflect the probabilities of belonging to this
class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be
set as the last level.}
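A short sketch of the "last level is positive" convention documented above (illustrative only):

# "yes" is the last level and therefore the positive class, so predictions
# from the fitted model are P(y == "yes"); reverse the levels to flip this.
y <- factor(c("no", "yes", "yes", "no"), levels = c("no", "yes"))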
\item{objective}{Optimization objective to minimize based on the supplied data, to be passed
by name as a string / character (e.g. `reg:absoluteerror`). See the
\href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{
Learning Task Parameters} page for more detailed information on allowed values.

If `NULL` (the default), will be automatically determined from `y` according to the following
logic:\itemize{
\item If `y` is a factor with 2 levels, will use `binary:logistic`.
\item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes
will be determined automatically, should not be passed under `params`).
\item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that
the only types supported are left / right / interval censored).
\item Otherwise, will use `reg:squarederror`.
}

If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
can only be used with classification objectives and vice-versa.

Note that not all possible `objective` values supported by the core XGBoost library are allowed
here - for example, objectives which are a variation of another but with a different default
prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are
ranking objectives, nor custom objectives at the moment.}
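The auto-detection described above condenses to a few type checks on y; a simplified sketch (not the actual code path in R/xgboost.R, and omitting the logical/character cases for brevity):

guess_objective <- function(y) {  # illustrative only
  if (is.factor(y) && nlevels(y) == 2L) return("binary:logistic")
  if (is.factor(y)) return("multi:softprob")
  if (inherits(y, "Surv")) return("survival:aft")
  "reg:squarederror"
}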
\item{nrounds}{Number of boosting iterations / rounds.

Note that the number of default boosting rounds here is not automatically tuned, and different
problems will have vastly different optimal numbers of boosting rounds.}

\item{weights}{Sample weights for each row in `x` and `y`. If `NULL` (the default), each row
will have the same weight.

If not `NULL`, should be passed as a numeric vector with length matching to the number of
rows in `x`.}

\item{verbosity}{Verbosity of printing messages. Valid values of 0 (silent), 1 (warning),
2 (info), and 3 (debug).}

\item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.}

\item{seed}{Seed to use for random number generation. If passing `NULL`, will draw a random
number using R's PRNG system to use as seed.}

\item{monotone_constraints}{Optional monotonicity constraints for features.

Can be passed either as a named list (when `x` has column names), or as a vector. If passed
as a vector and `x` has column names, will try to match the elements by name.

A value of `+1` for a given feature makes the model predictions / scores constrained to be
a monotonically increasing function of that feature (that is, as the value of the feature
increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically
decreasing function. A value of zero imposes no constraint.

The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which
case the columns that are not referred to in `monotone_constraints` will be assumed to have
a value of zero (no constraint imposed on the model for those features).

See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{
Monotonic Constraints} for a more detailed explanation.}
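For example, a named-list constraint on a single column, following the signature above (a sketch; unnamed columns default to zero):

data(mtcars)
model <- xgboost(
  mtcars[, -1], mtcars$mpg,
  monotone_constraints = list(hp = -1),  # predictions non-increasing in 'hp'
  nthreads = 1, nrounds = 3
)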
\item{interaction_constraints}{Constraints for interaction representing permitted interactions.
The constraints must be specified in the form of a list of vectors referencing columns in the
data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numeration
starting at 1 - i.e. the first sublist references the first and second columns) or
`list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references
columns by names), where each vector is a group of indices of features that are allowed to
interact with each other.

See the tutorial
\href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{
Feature Interaction Constraints} for more information.}
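A name-based example in the format described above (a sketch):

data(mtcars)
model <- xgboost(
  mtcars[, -1], mtcars$mpg,
  # 'cyl', 'disp' and 'hp' may interact with each other, and 'wt' with 'qsec',
  # but no interactions are allowed across the two groups.
  interaction_constraints = list(c("cyl", "disp", "hp"), c("wt", "qsec")),
  nthreads = 1, nrounds = 3
)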
\item{feature_weights}{Feature weights for column sampling.

Can be passed either as a vector with length matching to columns of `x`, or as a named
list (only if `x` has column names) with names matching to columns of 'x'. If it is a
named vector, will try to match the entries to column names of `x` by name.

If `NULL` (the default), all columns will have the same weight.}

\item{base_margin}{Base margin used for boosting from existing model.

If passing it, will start the gradient boosting procedure from the scores that are provided
here - for example, one can pass the raw scores from a previous model, or some per-observation
offset, or similar.

Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives)
with the same number of rows as `x` and number of columns corresponding to number of optimization
targets, and should be in the untransformed scale (for example, for objective `binary:logistic`,
it should have log-odds, not probabilities; and for objective `multi:softprob`, should have
number of columns matching to number of classes in the data).

Note that, if it contains more than one column, then columns will not be matched by name to
the corresponding `y` - `base_margin` should have the same column order that the model will use
(for example, for objective `multi:softprob`, columns of `base_margin` will be matched against
`levels(y)` by their position, regardless of what `colnames(base_margin)` returns).

If `NULL`, will start from zero, but note that for most objectives, an intercept is usually
added (controllable through parameter `base_score` instead) when `base_margin` is not passed.}
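A sketch of boosting from a previous model's raw scores via base_margin, assuming an existing booster model_old and matching x/y (and assuming predict() forwards outputmargin for this model class):

margin <- predict(model_old, x, outputmargin = TRUE)  # untransformed scores
model_new <- xgboost(x, y, base_margin = margin, nthreads = 1, nrounds = 10)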
\item{...}{Other training parameters. See the online documentation
\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for
details about possible values and what they do.

Note that not all possible values from the core XGBoost library are allowed as `params` for
'xgboost()' - in particular, values which require an already-fitted booster object (such as
`process_type`) are not accepted here.}
}
\value{
A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular
`xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an
additional attribute `metadata` containing information which is used for formatting prediction
outputs, such as class names for classification problems.
}
\description{
Fits an XGBoost model (boosted decision tree ensemble) to given x/y data.

See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{
Introduction to Boosted Trees} for a longer explanation of what XGBoost does.

This function is intended to provide a more user-friendly interface for XGBoost that follows
R's conventions for model fitting and predictions, but which doesn't expose all of the
possible functionalities of the core XGBoost library.

See \link{xgb.train} for a more flexible low-level alternative which is similar across different
language bindings of XGBoost and which exposes the full library's functionalities.
}
\details{
For package authors using `xgboost` as a dependency, it is highly recommended to use
\link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface
and performs fewer data conversions and copies along the way.
}
\examples{
library(xgboost)
data(mtcars)

# Fit a small regression model on the mtcars data
model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3)
predict(model_regression, mtcars, validate_features = TRUE)

# Task objective is determined automatically according to the type of 'y'
data(iris)
model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5)
predict(model_classif, iris, validate_features = TRUE)
}
\references{
\itemize{
\item Chen, Tianqi, and Carlos Guestrin. "Xgboost: A scalable tree boosting system."
Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and
data mining. 2016.
\item \url{https://xgboost.readthedocs.io/en/stable/}
}
}
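Beyond the regression and classification examples in the help page, the Surv path documented above can be sketched as follows (mirroring the right-censoring setup used in the new tests below; the column selection is illustrative):

library(xgboost)
library(survival)
data(cancer, package = "survival")
# A 'Surv' response selects survival:aft automatically.
y <- Surv(cancer$time, cancer$status - 1, type = "right")
model_aft <- xgboost(cancer[, c("age", "sex", "ph.ecog")], y, nthreads = 1, nrounds = 5)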
R-package/tests/testthat/test_xgboost.R (new file, 623 lines)
@@ -0,0 +1,623 @@
|
library(survival)
|
||||||
|
library(data.table)
|
||||||
|
|
||||||
|
test_that("Auto determine objective", {
|
||||||
|
y_num <- seq(1, 10)
|
||||||
|
res_num <- process.y.margin.and.objective(y_num, NULL, NULL, NULL)
|
||||||
|
expect_equal(res_num$params$objective, "reg:squarederror")
|
||||||
|
|
||||||
|
y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b'))
|
||||||
|
res_bin <- process.y.margin.and.objective(y_bin, NULL, NULL, NULL)
|
||||||
|
expect_equal(res_bin$params$objective, "binary:logistic")
|
||||||
|
|
||||||
|
y_multi <- factor(c('a', 'b', 'a', 'b', 'c'), c('a', 'b', 'c'))
|
||||||
|
res_multi <- process.y.margin.and.objective(y_multi, NULL, NULL, NULL)
|
||||||
|
expect_equal(res_multi$params$objective, "multi:softprob")
|
||||||
|
|
||||||
|
y_surv <- Surv(1:10, rep(c(0, 1), 5), type = "right")
|
||||||
|
res_surv <- process.y.margin.and.objective(y_surv, NULL, NULL, NULL)
|
||||||
|
expect_equal(res_surv$params$objective, "survival:aft")
|
||||||
|
|
||||||
|
y_multicol <- matrix(seq(1, 20), nrow = 5)
|
||||||
|
res_multicol <- process.y.margin.and.objective(y_multicol, NULL, NULL, NULL)
|
||||||
|
expect_equal(res_multicol$params$objective, "reg:squarederror")
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("Process vectors", {
|
||||||
|
y <- seq(1, 10)
|
||||||
|
for (y_inp in list(as.integer(y), as.numeric(y))) {
|
||||||
|
res <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$label,
|
||||||
|
y
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$objective,
|
||||||
|
"reg:pseudohubererror"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("Process factors", {
|
||||||
|
y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b'))
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_bin, NULL, "multi:softprob", NULL)
|
||||||
|
})
|
||||||
|
for (bin_obj in c("binary:logistic", "binary:hinge")) {
|
||||||
|
for (y_inp in list(y_bin, as.ordered(y_bin))) {
|
||||||
|
res_bin <- process.y.margin.and.objective(y_inp, NULL, bin_obj, NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$dmatrix_args$label,
|
||||||
|
c(0, 1, 0, 1)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$metadata$y_levels,
|
||||||
|
c('a', 'b')
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$params$objective,
|
||||||
|
bin_obj
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
y_bin2 <- factor(c(1, 0, 1, 0), c(1, 0))
|
||||||
|
res_bin <- process.y.margin.and.objective(y_bin2, NULL, "binary:logistic", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$dmatrix_args$label,
|
||||||
|
c(0, 1, 0, 1)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$metadata$y_levels,
|
||||||
|
c("1", "0")
|
||||||
|
)
|
||||||
|
|
||||||
|
y_bin3 <- c(TRUE, FALSE, TRUE)
|
||||||
|
res_bin <- process.y.margin.and.objective(y_bin3, NULL, "binary:logistic", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$dmatrix_args$label,
|
||||||
|
c(1, 0, 1)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_bin$metadata$y_levels,
|
||||||
|
c("FALSE", "TRUE")
|
||||||
|
)
|
||||||
|
|
||||||
|
y_multi <- factor(c('a', 'b', 'c', 'd', 'a', 'b'), c('a', 'b', 'c', 'd'))
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL)
|
||||||
|
})
|
||||||
|
res_multi <- process.y.margin.and.objective(y_multi, NULL, "multi:softprob", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$dmatrix_args$label,
|
||||||
|
c(0, 1, 2, 3, 0, 1)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$metadata$y_levels,
|
||||||
|
c('a', 'b', 'c', 'd')
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$params$num_class,
|
||||||
|
4
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$params$objective,
|
||||||
|
"multi:softprob"
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("Process survival objects", {
|
||||||
|
data(cancer, package = "survival")
|
||||||
|
y_right <- Surv(cancer$time, cancer$status - 1, type = "right")
|
||||||
|
res_cox <- process.y.margin.and.objective(y_right, NULL, "survival:cox", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_cox$dmatrix_args$label,
|
||||||
|
ifelse(cancer$status == 2, cancer$time, -cancer$time)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_cox$params$objective,
|
||||||
|
"survival:cox"
|
||||||
|
)
|
||||||
|
|
||||||
|
res_aft <- process.y.margin.and.objective(y_right, NULL, "survival:aft", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$dmatrix_args$label_lower_bound,
|
||||||
|
cancer$time
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$dmatrix_args$label_upper_bound,
|
||||||
|
ifelse(cancer$status == 2, cancer$time, Inf)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$params$objective,
|
||||||
|
"survival:aft"
|
||||||
|
)
|
||||||
|
|
||||||
|
y_left <- Surv(seq(1, 4), c(1, 0, 1, 0), type = "left")
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_left, NULL, "survival:cox", NULL)
|
||||||
|
})
|
||||||
|
res_aft <- process.y.margin.and.objective(y_left, NULL, "survival:aft", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$dmatrix_args$label_lower_bound,
|
||||||
|
c(1, 0, 3, 0)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$dmatrix_args$label_upper_bound,
|
||||||
|
seq(1, 4)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$params$objective,
|
||||||
|
"survival:aft"
|
||||||
|
)
|
||||||
|
|
||||||
|
y_interval <- Surv(
|
||||||
|
time = c(1, 5, 2, 10, 3),
|
||||||
|
time2 = c(2, 5, 2.5, 10, 3),
|
||||||
|
event = c(3, 1, 3, 0, 2),
|
||||||
|
type = "interval"
|
||||||
|
)
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_interval, NULL, "survival:cox", NULL)
|
||||||
|
})
|
||||||
|
res_aft <- process.y.margin.and.objective(y_interval, NULL, "survival:aft", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$dmatrix_args$label_lower_bound,
|
||||||
|
c(1, 5, 2, 10, 0)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$dmatrix_args$label_upper_bound,
|
||||||
|
c(2, 5, 2.5, Inf, 3)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_aft$params$objective,
|
||||||
|
"survival:aft"
|
||||||
|
)
|
||||||
|
|
||||||
|
y_interval_neg <- Surv(
|
||||||
|
time = c(1, -5, 2, 10, 3),
|
||||||
|
time2 = c(2, -5, 2.5, 10, 3),
|
||||||
|
event = c(3, 1, 3, 0, 2),
|
||||||
|
type = "interval"
|
||||||
|
)
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_interval_neg, NULL, "survival:aft", NULL)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("Process multi-target", {
|
||||||
|
data(mtcars)
|
||||||
|
y_multi <- data.frame(
|
||||||
|
y1 = mtcars$mpg,
|
||||||
|
y2 = mtcars$mpg ^ 2
|
||||||
|
)
|
||||||
|
for (y_inp in list(y_multi, as.matrix(y_multi), data.table::as.data.table(y_multi))) {
|
||||||
|
res_multi <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$dmatrix_args$label,
|
||||||
|
as.matrix(y_multi)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$metadata$y_names,
|
||||||
|
c("y1", "y2")
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res_multi$params$objective,
|
||||||
|
"reg:pseudohubererror"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_multi, NULL, "count:poisson", NULL)
|
||||||
|
})
|
||||||
|
|
||||||
|
y_bad <- data.frame(
|
||||||
|
c1 = seq(1, 3),
|
||||||
|
c2 = rep(as.Date("2024-01-01"), 3)
|
||||||
|
)
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL)
|
||||||
|
})
|
||||||
|
|
||||||
|
y_bad <- data.frame(
|
||||||
|
c1 = seq(1, 3),
|
||||||
|
c2 = factor(c('a', 'b', 'a'), c('a', 'b'))
|
||||||
|
)
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL)
|
||||||
|
})
|
||||||
|
|
||||||
|
y_bad <- seq(1, 20)
|
||||||
|
dim(y_bad) <- c(5, 2, 2)
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("Process base_margin", {
|
||||||
|
y <- seq(101, 110)
|
||||||
|
bm_good <- seq(1, 10)
|
||||||
|
for (bm in list(bm_good, as.matrix(bm_good), as.data.frame(as.matrix(bm_good)))) {
|
||||||
|
res <- process.y.margin.and.objective(y, bm, "reg:squarederror", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$base_margin,
|
||||||
|
seq(1, 10)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, 5, "reg:squarederror", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, seq(1, 5), "reg:squarederror", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, matrix(seq(1, 20), ncol = 2), "reg:squarederror", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(
|
||||||
|
y,
|
||||||
|
as.data.frame(matrix(seq(1, 20), ncol = 2)),
|
||||||
|
"reg:squarederror",
|
||||||
|
NULL
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
|
y <- factor(c('a', 'b', 'c', 'a'))
|
||||||
|
bm_good <- matrix(seq(1, 12), ncol = 3)
|
||||||
|
for (bm in list(bm_good, as.data.frame(bm_good))) {
|
||||||
|
res <- process.y.margin.and.objective(y, bm, "multi:softprob", NULL)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$base_margin |> unname(),
|
||||||
|
matrix(seq(1, 12), ncol = 3)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, as.numeric(bm_good), "multi:softprob", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, 5, "multi:softprob", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[, 1], "multi:softprob", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[, c(1, 2)], "multi:softprob", NULL)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[c(1, 2), ], "multi:softprob", NULL)
|
||||||
|
})
|
||||||
|
|
||||||
|
y <- seq(101, 110)
|
||||||
|
bm_good <- matrix(seq(1, 30), ncol = 3)
|
||||||
|
params <- list(quantile_alpha = c(0.1, 0.5, 0.9))
|
||||||
|
for (bm in list(bm_good, as.data.frame(bm_good))) {
|
||||||
|
res <- process.y.margin.and.objective(y, bm, "reg:quantileerror", params)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$base_margin |> unname(),
|
||||||
|
matrix(seq(1, 30), ncol = 3)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, as.numeric(bm_good), "reg:quantileerror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, 5, "reg:quantileerror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[, 1], "reg:quantileerror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:quantileerror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:quantileerror", params)
|
||||||
|
})
|
||||||
|
|
||||||
|
y <- matrix(seq(101, 130), ncol = 3)
|
||||||
|
for (bm in list(bm_good, as.data.frame(bm_good))) {
|
||||||
|
res <- process.y.margin.and.objective(y, bm, "reg:squarederror", params)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$base_margin |> unname(),
|
||||||
|
matrix(seq(1, 30), ncol = 3)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, as.numeric(bm_good), "reg:squarederror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, 5, "reg:squarederror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[, 1], "reg:squarederror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:squarederror", params)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:squarederror", params)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("Process monotone constraints", {
|
||||||
|
data(iris)
|
||||||
|
mc_list <- list(Sepal.Width = 1)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = mc_list,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$monotone_constraints,
|
||||||
|
c(0, 1, 0, 0, 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
mc_list2 <- list(Sepal.Width = 1, Petal.Width = -1)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = mc_list2,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$monotone_constraints,
|
||||||
|
c(0, 1, 0, -1, 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
mc_vec <- c(0, 1, -1, 0, 0)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = mc_vec,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$monotone_constraints,
|
||||||
|
c(0, 1, -1, 0, 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
mc_named_vec <- c(1, 1)
|
||||||
|
names(mc_named_vec) <- names(iris)[1:2]
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = mc_named_vec,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$monotone_constraints,
|
||||||
|
c(1, 1, 0, 0, 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
mc_named_all <- c(0, -1, 1, 0, -1)
|
||||||
|
names(mc_named_all) <- rev(names(iris))
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = mc_named_all,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$monotone_constraints,
|
||||||
|
rev(mc_named_all) |> unname()
|
||||||
|
)
|
||||||
|
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = list(
|
||||||
|
Sepal.Width = 1,
|
||||||
|
Petal.Width = -1,
|
||||||
|
Sepal.Width = -1
|
||||||
|
),
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = rep(0, 6),
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
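# Illustrative sketch (not part of the original tests): columns that are not
# mentioned in a named constraint default to 0 (no constraint), so a single
# named entry is placed at the matching column position.
res_demo <- process.x.and.col.args(
  iris,
  monotone_constraints = list(Petal.Length = 1),  # third column of iris
  interaction_constraints = NULL,
  feature_weights = NULL,
  lst_args = list(),
  use_qdm = FALSE
)
stopifnot(isTRUE(all.equal(res_demo$params$monotone_constraints, c(0, 0, 1, 0, 0))))
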
test_that("Process interaction_constraints", {
|
||||||
|
data(iris)
|
||||||
|
res <- process.x.and.col.args(iris, NULL, list(c(1L, 2L)), NULL, NULL, FALSE)
|
||||||
|
expect_equal(
|
||||||
|
res$params$interaction_constraints,
|
||||||
|
list(c(0, 1))
|
||||||
|
)
|
||||||
|
res <- process.x.and.col.args(iris, NULL, list(c(1.0, 2.0)), NULL, NULL, FALSE)
|
||||||
|
expect_equal(
|
||||||
|
res$params$interaction_constraints,
|
||||||
|
list(c(0, 1))
|
||||||
|
)
|
||||||
|
res <- process.x.and.col.args(iris, NULL, list(c(1, 2), c(3, 4)), NULL, NULL, FALSE)
|
||||||
|
expect_equal(
|
||||||
|
res$params$interaction_constraints,
|
||||||
|
list(c(0, 1), c(2, 3))
|
||||||
|
)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris, NULL, list(c("Sepal.Length", "Sepal.Width")), NULL, NULL, FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$interaction_constraints,
|
||||||
|
list(c(0, 1))
|
||||||
|
)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
as.matrix(iris),
|
||||||
|
NULL,
|
||||||
|
list(c("Sepal.Length", "Sepal.Width")),
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$interaction_constraints,
|
||||||
|
list(c(0, 1))
|
||||||
|
)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
NULL,
|
||||||
|
list(c("Sepal.Width", "Petal.Length"), c("Sepal.Length", "Petal.Width", "Species")),
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$params$interaction_constraints,
|
||||||
|
list(c(1, 2), c(0, 3, 4))
|
||||||
|
)
|
||||||
|
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, list(c(1L, 20L)), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, list(c(0L, 2L)), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, list(c("1", "2")), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, list(c("Sepal", "Petal")), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, c(1L, 2L), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, matrix(c(1L, 2L)), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
process.x.and.col.args(iris, NULL, list(c(1, 2.5)), NULL, NULL, FALSE)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
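# Illustrative sketch (not part of the original tests): columns 1 and 3 of
# `iris` (1-based) come out as the base-0 pair c(0, 2) that the booster expects.
res_demo <- process.x.and.col.args(iris, NULL, list(c(1L, 3L)), NULL, NULL, FALSE)
stopifnot(isTRUE(all.equal(res_demo$params$interaction_constraints, list(c(0, 2)))))
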
test_that("Sparse matrices are casted to CSR for QDM", {
|
||||||
|
data(agaricus.test, package = "xgboost")
|
||||||
|
x <- agaricus.test$data
|
||||||
|
for (x_in in list(x, methods::as(x, "TsparseMatrix"))) {
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
x_in,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
TRUE
|
||||||
|
)
|
||||||
|
expect_s4_class(res$dmatrix_args$data, "dgRMatrix")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
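# Illustrative sketch (not part of the original tests): a freshly built
# column-major dgCMatrix should likewise come back in row-major (CSR) form
# when use_qdm = TRUE.
x_csc <- Matrix::rsparsematrix(10, 4, density = 0.3)  # a dgCMatrix
res_demo <- process.x.and.col.args(x_csc, NULL, NULL, NULL, NULL, TRUE)
stopifnot(methods::is(res_demo$dmatrix_args$data, "dgRMatrix"))
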
test_that("Process feature_weights", {
|
||||||
|
data(iris)
|
||||||
|
w_vector <- seq(1, 5)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = NULL,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = w_vector,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$feature_weights,
|
||||||
|
seq(1, 5)
|
||||||
|
)
|
||||||
|
|
||||||
|
w_named_vector <- seq(1, 5)
|
||||||
|
names(w_named_vector) <- rev(names(iris))
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = NULL,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = w_named_vector,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$feature_weights,
|
||||||
|
rev(seq(1, 5))
|
||||||
|
)
|
||||||
|
|
||||||
|
w_list <- list(
|
||||||
|
Species = 5,
|
||||||
|
Sepal.Length = 1,
|
||||||
|
Sepal.Width = 2,
|
||||||
|
Petal.Length = 3,
|
||||||
|
Petal.Width = 4
|
||||||
|
)
|
||||||
|
res <- process.x.and.col.args(
|
||||||
|
iris,
|
||||||
|
monotone_constraints = NULL,
|
||||||
|
interaction_constraints = NULL,
|
||||||
|
feature_weights = w_list,
|
||||||
|
lst_args = list(),
|
||||||
|
use_qdm = FALSE
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
res$dmatrix_args$feature_weights,
|
||||||
|
seq(1, 5)
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
|
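# Illustrative sketch (not part of the original tests): names win over
# position, so weights supplied in any order are re-mapped to the column
# order of `x`.
w_demo <- c(Species = 10, Sepal.Length = 1, Sepal.Width = 2,
            Petal.Length = 3, Petal.Width = 4)
res_demo <- process.x.and.col.args(iris, NULL, NULL, w_demo, list(), FALSE)
stopifnot(isTRUE(all.equal(
  res_demo$dmatrix_args$feature_weights |> unname(),
  c(1, 2, 3, 4, 10)
)))
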
test_that("Whole function works", {
|
||||||
|
data(cancer, package = "survival")
|
||||||
|
y <- Surv(cancer$time, cancer$status - 1, type = "right")
|
||||||
|
x <- as.data.table(cancer)[, -c("time", "status")]
|
||||||
|
model <- xgboost(
|
||||||
|
x,
|
||||||
|
y,
|
||||||
|
monotone_constraints = list(age = -1),
|
||||||
|
nthreads = 1L,
|
||||||
|
nrounds = 5L,
|
||||||
|
eta = 3
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
attributes(model)$params$objective,
|
||||||
|
"survival:aft"
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
attributes(model)$metadata$n_targets,
|
||||||
|
1L
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
attributes(model)$params$monotone_constraints,
|
||||||
|
"(0,-1,0,0,0,0,0,0)"
|
||||||
|
)
|
||||||
|
expect_false(
|
||||||
|
"interaction_constraints" %in% names(attributes(model)$params)
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
attributes(model)$params$eta,
|
||||||
|
3
|
||||||
|
)
|
||||||
|
txt <- capture.output({
|
||||||
|
print(model)
|
||||||
|
})
|
||||||
|
expect_true(any(grepl("Objective: survival:aft", txt, fixed = TRUE)))
|
||||||
|
expect_true(any(grepl("monotone_constraints", txt, fixed = TRUE)))
|
||||||
|
expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE)))
|
||||||
|
expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE)))
|
||||||
|
})
|
||||||
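# Illustrative sketch (not part of the original tests): for "survival:aft",
# a right-censored observation is presumably encoded as the interval
# [time, +Inf), and an uncensored one as [time, time] - i.e. the DMatrix
# fields label_lower_bound / label_upper_bound.
times_demo  <- c(5, 10, 15)
status_demo <- c(1, 0, 1)  # 1 = event observed, 0 = right-censored
y_aft_demo  <- Surv(times_demo, status_demo, type = "right")
lower_demo  <- times_demo
upper_demo  <- ifelse(status_demo == 1, times_demo, Inf)
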
@ -173,8 +173,9 @@ Build the model
 The code below is fairly standard. For more information, see the documentation of the `xgboost()` function (or the vignette [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
 
 ```{r}
-bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
-               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")
+bst <- xgboost(x = sparse_matrix, y = output_vector,
+               params = list(max_depth = 4, eta = 1),
+               nthread = 2, nrounds = 10)
 ```
 
@ -299,28 +300,28 @@ test <- agaricus.test
 
 #Random Forest - 1000 trees
 bst <- xgboost(
-    data = train$data,
-    label = train$label,
-    max_depth = 4,
-    num_parallel_tree = 1000,
-    subsample = 0.5,
-    colsample_bytree = 0.5,
+    x = train$data,
+    y = factor(train$label, levels = c(0, 1)),
+    params = list(
+        max_depth = 4,
+        num_parallel_tree = 1000,
+        subsample = 0.5,
+        colsample_bytree = 0.5
+    ),
     nrounds = 1,
-    objective = "binary:logistic",
     nthread = 2
 )
 
 #Boosting - 3 rounds
 bst <- xgboost(
-    data = train$data,
-    label = train$label,
-    max_depth = 4,
+    x = train$data,
+    y = factor(train$label, levels = c(0, 1)),
+    params = list(max_depth = 4),
    nrounds = 3,
-    objective = "binary:logistic",
     nthread = 2
 )
 ```
 
-> Note that the parameter `round` is set to `1`.
+> Note that the parameter `nrounds` is set to `1`.
 
 > [**Random Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software.
 
@ -146,22 +146,19 @@ In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore,
 
 We will train a decision tree model using the following parameters:
 
-* `objective = "binary:logistic"`: we will train a binary classification model;
+* `objective = "binary:logistic"`: we will train a binary classification model (note that this is set automatically when `y` is a `factor`);
 * `max_depth = 2`: the trees won't be deep, because our case is very simple;
 * `nthread = 2`: the number of CPU threads we are going to use;
 * `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
 
 ```{r trainingSparse, message=F, warning=F}
 bstSparse <- xgboost(
-    data = train$data
-    , label = train$label
-    , params = list(
-        max_depth = 2
-        , eta = 1
-        , nthread = 2
-        , objective = "binary:logistic"
-    )
+    x = train$data
+    , y = factor(train$label, levels = c(0, 1))
+    , objective = "binary:logistic"
+    , params = list(max_depth = 2, eta = 1)
     , nrounds = 2
+    , nthread = 2
 )
 ```
 
@ -175,15 +172,11 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
 
 ```{r trainingDense, message=F, warning=F}
 bstDense <- xgboost(
-    data = as.matrix(train$data),
-    label = train$label,
-    params = list(
-        max_depth = 2,
-        eta = 1,
-        nthread = 2,
-        objective = "binary:logistic"
-    ),
-    nrounds = 2
+    x = as.matrix(train$data),
+    y = factor(train$label, levels = c(0, 1)),
+    params = list(max_depth = 2, eta = 1),
+    nrounds = 2,
+    nthread = 2
 )
 ```
 
@ -193,7 +186,7 @@ bstDense <- xgboost(
 
 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
-bstDMatrix <- xgboost(
+bstDMatrix <- xgb.train(
     data = dtrain,
     params = list(
         max_depth = 2,
@ -213,7 +206,7 @@ One of the simplest ways to see the training progress is to set the `verbose` opt
 
 ```{r trainingVerbose0, message=T, warning=F}
 # verbose = 0, no message
-bst <- xgboost(
+bst <- xgb.train(
     data = dtrain
     , params = list(
         max_depth = 2
@ -228,7 +221,7 @@ bst <- xgb.train(
 
 ```{r trainingVerbose1, message=T, warning=F}
 # verbose = 1, print evaluation metric
-bst <- xgboost(
+bst <- xgb.train(
     data = dtrain
     , params = list(
         max_depth = 2
@ -243,7 +236,7 @@ bst <- xgb.train(
 
 ```{r trainingVerbose2, message=T, warning=F}
 # verbose = 2, also print information about tree
-bst <- xgboost(
+bst <- xgb.train(
     data = dtrain
     , params = list(
         max_depth = 2
|||||||
@ -178,9 +178,10 @@ parameter:
|
|||||||
Using feature name instead
|
Using feature name instead
|
||||||
**************************
|
**************************
|
||||||
|
|
||||||
XGBoost's Python package supports using feature names instead of feature index for
|
XGBoost's Python and R packages support using feature names instead of feature index for
|
||||||
specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the
|
specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the
|
||||||
feature interaction constraint can be specified as ``[["f0", "f2"]]``.
|
feature interaction constraint can be specified as ``[["f0", "f2"]]`` (Python) or
|
||||||
|
``list(c("f0", "f2"))`` (R, when passing them to function ``xgboost()``).
|
||||||
|
|
||||||
**************
|
**************
|
||||||
Advanced topic
|
Advanced topic
|
||||||
|
|||||||
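A minimal R sketch of the form documented above, assuming a hypothetical data frame with columns `f0`, `f1`, `f2` and that `xgboost()` forwards `interaction_constraints` the way the new tests exercise it:

```r
# Hypothetical data; only f0 and f2 may interact, f1 is kept separate.
df <- data.frame(f0 = rnorm(100), f1 = rnorm(100), f2 = rnorm(100))
y <- rnorm(100)
model <- xgboost(
  df, y,
  interaction_constraints = list(c("f0", "f2")),
  nthreads = 1L,
  nrounds = 10L
)
```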
@ -97,7 +97,8 @@ Some other examples:
 Using feature names
 *******************
 
-XGBoost's Python package supports using feature names instead of feature index for
+XGBoost's Python and R packages support using feature names instead of feature indices for
 specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the
-monotonic constraint can be specified as ``{"f0": 1, "f2": -1}``, and ``"f1"`` will
+monotonic constraint can be specified as ``{"f0": 1, "f2": -1}`` (Python) or as
+``list(f0 = 1, f2 = -1)`` (R, when using ``xgboost()`` but not ``xgb.train()``), and ``"f1"`` will
 default to ``0`` (no constraint).
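And a matching R sketch for monotonic constraints by name, under the same hypothetical columns:

```r
# Hypothetical data; f0 is constrained to be increasing, f2 decreasing,
# and f1 defaults to 0 (unconstrained).
df <- data.frame(f0 = rnorm(100), f1 = rnorm(100), f2 = rnorm(100))
y <- rnorm(100)
model <- xgboost(
  df, y,
  monotone_constraints = list(f0 = 1, f2 = -1),
  nthreads = 1L,
  nrounds = 10L
)
```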