[R] Redesigned xgboost() interface skeleton (#10456)

---------

Co-authored-by: Michael Mayer <mayermichael79@gmail.com>
david-cortes 2024-07-15 12:44:58 +02:00 committed by GitHub
parent 17c64300e3
commit ab982e7873
35 changed files with 1997 additions and 242 deletions


@@ -57,7 +57,8 @@ Suggests:
     igraph (>= 1.0.1),
     float,
     titanic,
-    RhpcBLASctl
+    RhpcBLASctl,
+    survival
 Depends:
     R (>= 4.3.0)
 Imports:


@@ -13,6 +13,7 @@ S3method(predict,xgb.Booster)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
 S3method(print,xgb.cv.synchronous)
+S3method(print,xgboost)
 S3method(setinfo,xgb.Booster)
 S3method(setinfo,xgb.DMatrix)
 S3method(variable.names,xgb.Booster)


@@ -30,6 +30,40 @@ NVL <- function(x, val) {
   return(c('rank:pairwise', 'rank:ndcg', 'rank:map'))
 }
+
+.OBJECTIVES_NON_DEFAULT_MODE <- function() {
+  return(c("reg:logistic", "binary:logitraw", "multi:softmax"))
+}
+
+.BINARY_CLASSIF_OBJECTIVES <- function() {
+  return(c("binary:logistic", "binary:hinge"))
+}
+
+.MULTICLASS_CLASSIF_OBJECTIVES <- function() {
+  return("multi:softprob")
+}
+
+.SURVIVAL_RIGHT_CENSORING_OBJECTIVES <- function() { # nolint
+  return(c("survival:cox", "survival:aft"))
+}
+
+.SURVIVAL_ALL_CENSORING_OBJECTIVES <- function() { # nolint
+  return("survival:aft")
+}
+
+.REGRESSION_OBJECTIVES <- function() {
+  return(c(
+    "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror",
+    "reg:absoluteerror", "reg:quantileerror", "count:poisson", "reg:gamma", "reg:tweedie"
+  ))
+}
+
+.MULTI_TARGET_OBJECTIVES <- function() {
+  return(c(
+    "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror",
+    "reg:quantileerror", "reg:gamma"
+  ))
+}
+
 #
 # Low-level functions for boosting --------------------------------------------
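These groupings feed the new interface's validation of user-supplied objectives. A rough sketch of how such a check might be used (hypothetical helper, not taken from this diff):

# hypothetical sketch: reject objectives that cannot be paired with a factor 'y'
check.classif.objective <- function(objective) {
  allowed <- c(.BINARY_CLASSIF_OBJECTIVES(), .MULTICLASS_CLASSIF_OBJECTIVES())
  if (!objective %in% allowed) {
    stop("Objective '", objective, "' cannot be used with a factor 'y'.")
  }
}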


@@ -663,9 +663,8 @@ validate.features <- function(bst, newdata) {
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -767,9 +766,8 @@ xgb.attributes <- function(object) {
 #' data.table::setDTthreads(nthread)
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = nthread,
@@ -817,9 +815,8 @@ xgb.config <- function(object) {
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -1230,9 +1227,8 @@ xgb.is.same.Booster <- function(obj1, obj2) {
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(
-#'   data = train$data,
-#'   label = train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(train$data, label = train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,


@@ -853,36 +853,6 @@ xgb.DMatrix.hasinfo <- function(object, info) {
 }
-
-# get dmatrix from data, label
-# internal helper method
-xgb.get.DMatrix <- function(data, label, missing, weight, nthread) {
-  if (inherits(data, "dgCMatrix") || is.matrix(data)) {
-    if (is.null(label)) {
-      stop("label must be provided when data is a matrix")
-    }
-    dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread)
-    if (!is.null(weight)) {
-      setinfo(dtrain, "weight", weight)
-    }
-  } else {
-    if (!is.null(label)) {
-      warning("xgboost: label will be ignored.")
-    }
-    if (is.character(data)) {
-      data <- path.expand(data)
-      dtrain <- xgb.DMatrix(data[1])
-    } else if (inherits(data, "xgb.DMatrix")) {
-      dtrain <- data
-    } else if (inherits(data, "data.frame")) {
-      stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
-    } else {
-      stop("xgboost: invalid input data")
-    }
-  }
-  return(dtrain)
-}
-
 #' Dimensions of xgb.DMatrix
 #'
 #' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
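With this internal helper removed, the examples below now build the `xgb.DMatrix` explicitly and call `xgb.train()`. A minimal sketch of the replacement pattern used throughout the updated docs:

library(xgboost)
data(agaricus.train, package = "xgboost")
# build the DMatrix yourself instead of passing data/label to xgboost()
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
bst <- xgb.train(
  data = dtrain,
  params = list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic"),
  nrounds = 2
)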


@@ -29,8 +29,8 @@
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
 #'   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' # save the model in file 'xgb.model.dump'
 #' dump_path = file.path(tempdir(), 'model.dump')
 #' xgb.dump(bst, dump_path, with_stats = TRUE)


@@ -46,9 +46,8 @@
 #' # binomial classification using "gbtree":
 #' data(agaricus.train, package = "xgboost")
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = 2,
@@ -59,9 +58,8 @@
 #' xgb.importance(model = bst)
 #'
 #' # binomial classification using "gblinear":
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   booster = "gblinear",
 #'   eta = 0.3,
 #'   nthread = 1,
@@ -73,9 +71,11 @@
 #' # multiclass classification using "gbtree":
 #' nclass <- 3
 #' nrounds <- 10
-#' mbst <- xgboost(
-#'   data = as.matrix(iris[, -5]),
-#'   label = as.numeric(iris$Species) - 1,
+#' mbst <- xgb.train(
+#'   data = xgb.DMatrix(
+#'     as.matrix(iris[, -5]),
+#'     label = as.numeric(iris$Species) - 1
+#'   ),
 #'   max_depth = 3,
 #'   eta = 0.2,
 #'   nthread = 2,
@@ -99,9 +99,11 @@
 #' )
 #'
 #' # multiclass classification using "gblinear":
-#' mbst <- xgboost(
-#'   data = scale(as.matrix(iris[, -5])),
-#'   label = as.numeric(iris$Species) - 1,
+#' mbst <- xgb.train(
+#'   data = xgb.DMatrix(
+#'     scale(as.matrix(iris[, -5])),
+#'     label = as.numeric(iris$Species) - 1
+#'   ),
 #'   booster = "gblinear",
 #'   eta = 0.2,
 #'   nthread = 1,


@@ -43,9 +43,8 @@
 #' nthread <- 1
 #' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 2,
 #'   eta = 1,
 #'   nthread = nthread,


@@ -48,9 +48,8 @@
 #' data.table::setDTthreads(nthread)
 #'
 #' ## Change max_depth to a higher number to get a more significant result
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 6,
 #'   nthread = nthread,
 #'   nrounds = 50,


@@ -51,9 +51,8 @@
 #' nthread <- 2
 #' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 3,
 #'   eta = 1,
 #'   nthread = nthread,


@@ -35,9 +35,8 @@
 #' nthread <- 2
 #' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
 #'   max_depth = 15,
 #'   eta = 1,
 #'   nthread = nthread,


@@ -82,9 +82,8 @@
 #' data.table::setDTthreads(nthread)
 #' nrounds <- 20
 #'
-#' bst <- xgboost(
-#'   agaricus.train$data,
-#'   agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
 #'   nrounds = nrounds,
 #'   eta = 0.1,
 #'   max_depth = 3,
@@ -108,9 +107,8 @@
 #' set.seed(123)
 #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
 #'
-#' mbst <- xgboost(
-#'   data = x,
-#'   label = as.numeric(iris$Species) - 1,
+#' mbst <- xgb.train(
+#'   data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
 #'   nrounds = nrounds,
 #'   max_depth = 2,
 #'   eta = 0.3,


@@ -68,9 +68,8 @@
 #' @examples
 #' data(agaricus.train, package = "xgboost")
 #'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
+#' bst <- xgb.train(
+#'   data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
 #'   max_depth = 3,
 #'   eta = 1,
 #'   nthread = 2,


@@ -182,12 +182,6 @@
 #' as R attributes, and thus do not get saved when using XGBoost's own serializaters like
 #' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
 #' @param ... other parameters to pass to \code{params}.
-#' @param label vector of response values. Should not be provided when data is
-#'        a local data file name or an \code{xgb.DMatrix}.
-#' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
-#'        by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-#'        This parameter is only used when input is a dense matrix.
-#' @param weight a vector indicating the weight for each row of the input.
 #'
 #' @return
 #' An object of class \code{xgb.Booster}.
@@ -328,12 +322,10 @@
 #'   early_stopping_rounds = 3)
 #'
 #' ## An 'xgboost' interface example:
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-#'   max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-#'   objective = "binary:logistic")
+#' bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+#'   params = list(max_depth = 2, eta = 1), nthread = nthread, nrounds = 2)
 #' pred <- predict(bst, agaricus.test$data)
 #'
-#' @rdname xgb.train
 #' @export
 xgb.train <- function(params = list(), data, nrounds, evals = list(),
                       obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,

(File diff suppressed because it is too large.)


@@ -16,29 +16,28 @@ class(train$data)
 # note: we are putting in sparse matrix here, xgboost naturally handles sparse input
 # use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
 print("Training xgboost with sparseMatrix")
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic")
+bst <- xgboost(x = train$data, y = factor(train$label, c(0, 1)),
+               params = list(max_depth = 2, eta = 1),
+               nrounds = 2, nthread = 2)
 
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
 print("Training xgboost with Matrix")
-bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic")
+bst <- xgboost(x = as.matrix(train$data), y = factor(train$label, c(0, 1)),
+               params = list(max_depth = 2, eta = 1),
+               nrounds = 2, nthread = 2)
 
 # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
 print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
-               objective = "binary:logistic")
+params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic")
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2)
 
 # Verbose = 0,1,2
 print("Train xgboost with verbose 0, no message")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 0)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 0)
 print("Train xgboost with verbose 1, print evaluation metric")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 1)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 1)
 print("Train xgboost with verbose 2, also print information about tree")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 2)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 2)
 
 # you can also specify data as file path to a LIBSVM format input
 # since we do not have this file with us, the following line is just for illustration


@@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train
-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,


@@ -64,9 +64,8 @@ example of these behaviors).
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train
-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,


@@ -35,9 +35,8 @@ nthread <- 1
 data.table::setDTthreads(nthread)
 train <- agaricus.train
-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = nthread,


@@ -49,8 +49,8 @@ data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
 dump_path = file.path(tempdir(), 'model.dump')
 xgb.dump(bst, dump_path, with_stats = TRUE)


@@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati
 # binomial classification using "gbtree":
 data(agaricus.train, package = "xgboost")
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,
@@ -83,9 +82,8 @@ bst <- xgboost(
 xgb.importance(model = bst)
 
 # binomial classification using "gblinear":
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   booster = "gblinear",
   eta = 0.3,
   nthread = 1,
@@ -97,9 +95,11 @@ xgb.importance(model = bst)
 # multiclass classification using "gbtree":
 nclass <- 3
 nrounds <- 10
-mbst <- xgboost(
-  data = as.matrix(iris[, -5]),
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(
+    as.matrix(iris[, -5]),
+    label = as.numeric(iris$Species) - 1
+  ),
   max_depth = 3,
   eta = 0.2,
   nthread = 2,
@@ -123,9 +123,11 @@ xgb.importance(
 )
 
 # multiclass classification using "gblinear":
-mbst <- xgboost(
-  data = scale(as.matrix(iris[, -5])),
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(
+    scale(as.matrix(iris[, -5])),
+    label = as.numeric(iris$Species) - 1
+  ),
   booster = "gblinear",
   eta = 0.2,
   nthread = 1,


@@ -63,9 +63,8 @@ data(agaricus.train, package = "xgboost")
 nthread <- 1
 data.table::setDTthreads(nthread)
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 2,
   eta = 1,
   nthread = nthread,


@@ -33,9 +33,8 @@ will reset its number of rounds indicator to zero.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train
-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,


@@ -73,9 +73,8 @@ nthread <- 2
 data.table::setDTthreads(nthread)
 ## Change max_depth to a higher number to get a more significant result
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 6,
   nthread = nthread,
   nrounds = 50,


@@ -88,9 +88,8 @@ data(agaricus.train)
 nthread <- 2
 data.table::setDTthreads(nthread)
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 3,
   eta = 1,
   nthread = nthread,


@@ -67,9 +67,8 @@ data(agaricus.train, package = "xgboost")
 nthread <- 2
 data.table::setDTthreads(nthread)
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
   max_depth = 15,
   eta = 1,
   nthread = nthread,


@@ -135,9 +135,8 @@ nthread <- 1
 data.table::setDTthreads(nthread)
 nrounds <- 20
-bst <- xgboost(
-  agaricus.train$data,
-  agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
   nrounds = nrounds,
   eta = 0.1,
   max_depth = 3,
@@ -161,9 +160,8 @@ x <- as.matrix(iris[, -5])
 set.seed(123)
 is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
-mbst <- xgboost(
-  data = x,
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
   nrounds = nrounds,
   max_depth = 2,
   eta = 0.3,


@@ -96,9 +96,8 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR back
 \examples{
 data(agaricus.train, package = "xgboost")
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
   max_depth = 3,
   eta = 1,
   nthread = 2,


@@ -1,8 +1,7 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/xgb.train.R, R/xgboost.R
+% Please edit documentation in R/xgb.train.R
 \name{xgb.train}
 \alias{xgb.train}
-\alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(
@@ -22,24 +21,6 @@ xgb.train(
   callbacks = list(),
   ...
 )
-
-xgboost(
-  data = NULL,
-  label = NULL,
-  missing = NA,
-  weight = NULL,
-  params = list(),
-  nrounds,
-  verbose = 1,
-  print_every_n = 1L,
-  early_stopping_rounds = NULL,
-  maximize = NULL,
-  save_period = NULL,
-  save_name = "xgboost.model",
-  xgb_model = NULL,
-  callbacks = list(),
-  ...
-)
 }
 \arguments{
 \item{params}{the list of parameters. The complete list of parameters is
@@ -240,15 +221,6 @@ to customize the training process.
 }\if{html}{\out{</div>}}}
 
 \item{...}{other parameters to pass to \code{params}.}
 
-\item{label}{vector of response values. Should not be provided when data is
-a local data file name or an \code{xgb.DMatrix}.}
-
-\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing'
-by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-This parameter is only used when input is a dense matrix.}
-
-\item{weight}{a vector indicating the weight for each row of the input.}
 }
 \value{
 An object of class \code{xgb.Booster}.
@@ -383,9 +355,8 @@ bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
   early_stopping_rounds = 3)
 
 ## An 'xgboost' interface example:
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-  max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-  objective = "binary:logistic")
+bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+  params = list(max_depth = 2, eta = 1), nthread = nthread, nrounds = 2)
 pred <- predict(bst, agaricus.test$data)
 }

R-package/man/xgboost.Rd (new file, 213 lines)

@@ -0,0 +1,213 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost}
\alias{xgboost}
\title{Fit XGBoost Model}
\usage{
xgboost(
x,
y,
objective = NULL,
nrounds = 100L,
weights = NULL,
verbosity = 0L,
nthreads = parallel::detectCores(),
seed = 0L,
monotone_constraints = NULL,
interaction_constraints = NULL,
feature_weights = NULL,
base_margin = NULL,
...
)
}
\arguments{
\item{x}{The features / covariates. Can be passed as:\itemize{
\item A numeric or integer `matrix`.
\item A `data.frame`, in which all columns are one of the following types:\itemize{
\item `numeric`
\item `integer`
\item `logical`
\item `factor`
}
Columns of `factor` type will be assumed to be categorical, while other column types will
be assumed to be numeric.
\item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class.
}
Note that categorical features are only supported for `data.frame` inputs, and are automatically
determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible
variants that would allow something like categorical features on sparse matrices.}
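A quick illustrative sketch of the two main input types (using `iris`, which is not part of this description): a numeric matrix is all-numeric, while a `data.frame` with a `factor` column gets that column treated as categorical.

library(xgboost)
# all-numeric matrix input
m_mat <- xgboost(as.matrix(iris[, 1:4]), iris$Species, nthreads = 1, nrounds = 3)
# data.frame input: 'Species' is a factor, hence treated as a categorical feature
m_df <- xgboost(iris[, c("Sepal.Width", "Species")], iris$Sepal.Length, nthreads = 1, nrounds = 3)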
\item{y}{The response variable. Allowed values are:\itemize{
\item A numeric or integer vector (for regression tasks).
\item A factor or character vector (for binary and multi-class classification tasks).
\item A logical (boolean) vector (for binary classification tasks).
\item A numeric or integer matrix or `data.frame` with numeric/integer columns
(for multi-task regression tasks).
\item A `Surv` object from the `survival` package (for survival tasks).
}
If `objective` is `NULL`, the right task will be determined automatically based on
the class of `y`.
If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
can only be used with classification objectives and vice-versa.
For binary classification, the last factor level of `y` will be used as the "positive"
class - that is, the numbers from `predict` will reflect the probabilities of belonging to this
class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be
set as the last level.}
\item{objective}{Optimization objective to minimize based on the supplied data, to be passed
by name as a string / character (e.g. `reg:absoluteerror`). See the
\href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{
Learning Task Parameters} page for more detailed information on allowed values.
If `NULL` (the default), will be automatically determined from `y` according to the following
logic:\itemize{
\item If `y` is a factor with 2 levels, will use `binary:logistic`.
\item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes
will be determined automatically, should not be passed under `params`).
\item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that
the only types supported are left / right / interval censored).
\item Otherwise, will use `reg:squarederror`.
}
If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
can only be used with classification objectives and vice-versa.
Note that not all possible `objective` values supported by the core XGBoost library are allowed
here - for example, objectives which are a variation of another but with a different default
prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are
ranking objectives, nor custom objectives at the moment.}
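A short sketch of how the class of `y` picks the task, and how an explicit `objective` must agree with it (illustrative only, again using `iris`):

x <- iris[, c("Petal.Length", "Petal.Width")]
m_reg <- xgboost(x, iris$Sepal.Length, nthreads = 1, nrounds = 3)         # numeric y -> reg:squarederror
m_bin <- xgboost(x, iris$Species == "setosa", nthreads = 1, nrounds = 3)  # logical y -> binary:logistic
m_cls <- xgboost(x, iris$Species, nthreads = 1, nrounds = 3)              # factor y -> multi:softprob
# an explicit objective is allowed as long as it matches the type of y
m_hinge <- xgboost(x, iris$Species == "setosa", objective = "binary:hinge",
                   nthreads = 1, nrounds = 3)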
\item{nrounds}{Number of boosting iterations / rounds.
Note that the number of default boosting rounds here is not automatically tuned, and different
problems will have vastly different optimal numbers of boosting rounds.}
\item{weights}{Sample weights for each row in `x` and `y`. If `NULL` (the default), each row
will have the same weight.
If not `NULL`, should be passed as a numeric vector with length matching to the number of
rows in `x`.}
\item{verbosity}{Verbosity of printing messages. Valid values of 0 (silent), 1 (warning),
2 (info), and 3 (debug).}
\item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.}
\item{seed}{Seed to use for random number generation. If passing `NULL`, will draw a random
number using R's PRNG system to use as seed.}
\item{monotone_constraints}{Optional monotonicity constraints for features.
Can be passed either as a named list (when `x` has column names), or as a vector. If passed
as a vector and `x` has column names, will try to match the elements by name.
A value of `+1` for a given feature makes the model predictions / scores constrained to be
a monotonically increasing function of that feature (that is, as the value of the feature
increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically
decreasing function. A value of zero imposes no constraint.
The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which
case the columns that are not referred to in `monotone_constraints` will be assumed to have
a value of zero (no constraint imposed on the model for those features).
See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{
Monotonic Constraints} for a more detailed explanation.}
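For instance, a sketch (reusing the `survival::cancer` data that the tests below also use) constraining predictions to be non-increasing in `age` while leaving the other columns unconstrained:

data(cancer, package = "survival")
x <- cancer[, c("age", "ph.ecog", "wt.loss")]
m <- xgboost(x, cancer$time, monotone_constraints = list(age = -1),
             nthreads = 1, nrounds = 5)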
\item{interaction_constraints}{Constraints for interaction representing permitted interactions.
The constraints must be specified in the form of a list of vectors referencing columns in the
data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numeration
starting at 1 - i.e. the first sublist references the first and second columns) or
`list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references
columns by names), where each vector is a group of indices of features that are allowed to
interact with each other.
See the tutorial
\href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{
Feature Interaction Constraints} for more information.}
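For instance, a sketch (with `iris`) letting the sepal columns interact only with each other and the petal columns only with each other:

m <- xgboost(
  iris[, 1:4], iris$Species,
  interaction_constraints = list(
    c("Sepal.Length", "Sepal.Width"),
    c("Petal.Length", "Petal.Width")
  ),
  nthreads = 1, nrounds = 5
)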
\item{feature_weights}{Feature weights for column sampling.
Can be passed either as a vector with length matching to columns of `x`, or as a named
list (only if `x` has column names) with names matching to columns of 'x'. If it is a
named vector, will try to match the entries to column names of `x` by name.
If `NULL` (the default), all columns will have the same weight.}
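For instance, a sketch down-weighting one column for column sampling (`colsample_bynode` here is an ordinary booster parameter assumed to be passed through `...`):

m <- xgboost(
  iris[, 1:4], iris$Species,
  feature_weights = c(Sepal.Length = 1, Sepal.Width = 1, Petal.Length = 1, Petal.Width = 0.2),
  colsample_bynode = 0.5,
  nthreads = 1, nrounds = 5
)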
\item{base_margin}{Base margin used for boosting from existing model.
If passing it, will start the gradient boosting procedure from the scores that are provided
here - for example, one can pass the raw scores from a previous model, or some per-observation
offset, or similar.
Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives)
with the same number of rows as `x` and number of columns corresponding to number of optimization
targets, and should be in the untransformed scale (for example, for objective `binary:logistic`,
it should have log-odds, not probabilities; and for objective `multi:softprob`, should have
number of columns matching to number of classes in the data).
Note that, if it contains more than one column, then columns will not be matched by name to
the corresponding `y` - `base_margin` should have the same column order that the model will use
(for example, for objective `multi:softprob`, columns of `base_margin` will be matched against
`levels(y)` by their position, regardless of what `colnames(base_margin)` returns).
If `NULL`, will start from zero, but note that for most objectives, an intercept is usually
added (controllable through parameter `base_score` instead) when `base_margin` is not passed.}
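For instance, a sketch starting the boosting from a constant per-observation offset on the raw (untransformed) scale for a squared-error regression:

x <- as.matrix(mtcars[, -1])
offset <- rep(mean(mtcars$mpg), nrow(x))   # raw-scale offset, one value per row
m <- xgboost(x, mtcars$mpg, base_margin = offset, nthreads = 1, nrounds = 3)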
\item{...}{Other training parameters. See the online documentation
\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for
details about possible values and what they do.
Note that not all possible values from the core XGBoost library are allowed as `params` for
'xgboost()' - in particular, values which require an already-fitted booster object (such as
`process_type`) are not accepted here.}
}
\value{
A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular
`xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an
additional attribute `metadata` containing information which is used for formatting prediction
outputs, such as class names for classification problems.
}
\description{
Fits an XGBoost model (boosted decision tree ensemble) to given x/y data.
See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{
Introduction to Boosted Trees} for a longer explanation of what XGBoost does.
This function is intended to provide a more user-friendly interface for XGBoost that follows
R's conventions for model fitting and predictions, but which doesn't expose all of the
possible functionalities of the core XGBoost library.
See \link{xgb.train} for a more flexible low-level alternative which is similar across different
language bindings of XGBoost and which exposes the full library's functionalities.
}
\details{
For package authors using `xgboost` as a dependency, it is highly recommended to use
\link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface
and performs fewer data conversions and copies along the way.
}
\examples{
library(xgboost)
data(mtcars)
# Fit a small regression model on the mtcars data
model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3)
predict(model_regression, mtcars, validate_features = TRUE)
# Task objective is determined automatically according to the type of 'y'
data(iris)
model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5)
predict(model_classif, iris, validate_features = TRUE)
}
\references{
\itemize{
\item Chen, Tianqi, and Carlos Guestrin. "Xgboost: A scalable tree boosting system."
Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and
data mining. 2016.
\item \url{https://xgboost.readthedocs.io/en/stable/}
}
}


@@ -0,0 +1,623 @@
library(survival)
library(data.table)
test_that("Auto determine objective", {
y_num <- seq(1, 10)
res_num <- process.y.margin.and.objective(y_num, NULL, NULL, NULL)
expect_equal(res_num$params$objective, "reg:squarederror")
y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b'))
res_bin <- process.y.margin.and.objective(y_bin, NULL, NULL, NULL)
expect_equal(res_bin$params$objective, "binary:logistic")
y_multi <- factor(c('a', 'b', 'a', 'b', 'c'), c('a', 'b', 'c'))
res_multi <- process.y.margin.and.objective(y_multi, NULL, NULL, NULL)
expect_equal(res_multi$params$objective, "multi:softprob")
y_surv <- Surv(1:10, rep(c(0, 1), 5), type = "right")
res_surv <- process.y.margin.and.objective(y_surv, NULL, NULL, NULL)
expect_equal(res_surv$params$objective, "survival:aft")
y_multicol <- matrix(seq(1, 20), nrow = 5)
res_multicol <- process.y.margin.and.objective(y_multicol, NULL, NULL, NULL)
expect_equal(res_multicol$params$objective, "reg:squarederror")
})
test_that("Process vectors", {
y <- seq(1, 10)
for (y_inp in list(as.integer(y), as.numeric(y))) {
res <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL)
expect_equal(
res$dmatrix_args$label,
y
)
expect_equal(
res$params$objective,
"reg:pseudohubererror"
)
}
})
test_that("Process factors", {
y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b'))
expect_error({
process.y.margin.and.objective(y_bin, NULL, "multi:softprob", NULL)
})
for (bin_obj in c("binary:logistic", "binary:hinge")) {
for (y_inp in list(y_bin, as.ordered(y_bin))) {
res_bin <- process.y.margin.and.objective(y_inp, NULL, bin_obj, NULL)
expect_equal(
res_bin$dmatrix_args$label,
c(0, 1, 0, 1)
)
expect_equal(
res_bin$metadata$y_levels,
c('a', 'b')
)
expect_equal(
res_bin$params$objective,
bin_obj
)
}
}
y_bin2 <- factor(c(1, 0, 1, 0), c(1, 0))
res_bin <- process.y.margin.and.objective(y_bin2, NULL, "binary:logistic", NULL)
expect_equal(
res_bin$dmatrix_args$label,
c(0, 1, 0, 1)
)
expect_equal(
res_bin$metadata$y_levels,
c("1", "0")
)
y_bin3 <- c(TRUE, FALSE, TRUE)
res_bin <- process.y.margin.and.objective(y_bin3, NULL, "binary:logistic", NULL)
expect_equal(
res_bin$dmatrix_args$label,
c(1, 0, 1)
)
expect_equal(
res_bin$metadata$y_levels,
c("FALSE", "TRUE")
)
y_multi <- factor(c('a', 'b', 'c', 'd', 'a', 'b'), c('a', 'b', 'c', 'd'))
expect_error({
process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL)
})
expect_error({
process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL)
})
res_multi <- process.y.margin.and.objective(y_multi, NULL, "multi:softprob", NULL)
expect_equal(
res_multi$dmatrix_args$label,
c(0, 1, 2, 3, 0, 1)
)
expect_equal(
res_multi$metadata$y_levels,
c('a', 'b', 'c', 'd')
)
expect_equal(
res_multi$params$num_class,
4
)
expect_equal(
res_multi$params$objective,
"multi:softprob"
)
})
test_that("Process survival objects", {
data(cancer, package = "survival")
y_right <- Surv(cancer$time, cancer$status - 1, type = "right")
res_cox <- process.y.margin.and.objective(y_right, NULL, "survival:cox", NULL)
expect_equal(
res_cox$dmatrix_args$label,
ifelse(cancer$status == 2, cancer$time, -cancer$time)
)
expect_equal(
res_cox$params$objective,
"survival:cox"
)
res_aft <- process.y.margin.and.objective(y_right, NULL, "survival:aft", NULL)
expect_equal(
res_aft$dmatrix_args$label_lower_bound,
cancer$time
)
expect_equal(
res_aft$dmatrix_args$label_upper_bound,
ifelse(cancer$status == 2, cancer$time, Inf)
)
expect_equal(
res_aft$params$objective,
"survival:aft"
)
y_left <- Surv(seq(1, 4), c(1, 0, 1, 0), type = "left")
expect_error({
process.y.margin.and.objective(y_left, NULL, "survival:cox", NULL)
})
res_aft <- process.y.margin.and.objective(y_left, NULL, "survival:aft", NULL)
expect_equal(
res_aft$dmatrix_args$label_lower_bound,
c(1, 0, 3, 0)
)
expect_equal(
res_aft$dmatrix_args$label_upper_bound,
seq(1, 4)
)
expect_equal(
res_aft$params$objective,
"survival:aft"
)
y_interval <- Surv(
time = c(1, 5, 2, 10, 3),
time2 = c(2, 5, 2.5, 10, 3),
event = c(3, 1, 3, 0, 2),
type = "interval"
)
expect_error({
process.y.margin.and.objective(y_interval, NULL, "survival:cox", NULL)
})
res_aft <- process.y.margin.and.objective(y_interval, NULL, "survival:aft", NULL)
expect_equal(
res_aft$dmatrix_args$label_lower_bound,
c(1, 5, 2, 10, 0)
)
expect_equal(
res_aft$dmatrix_args$label_upper_bound,
c(2, 5, 2.5, Inf, 3)
)
expect_equal(
res_aft$params$objective,
"survival:aft"
)
y_interval_neg <- Surv(
time = c(1, -5, 2, 10, 3),
time2 = c(2, -5, 2.5, 10, 3),
event = c(3, 1, 3, 0, 2),
type = "interval"
)
expect_error({
process.y.margin.and.objective(y_interval_neg, NULL, "survival:aft", NULL)
})
})
test_that("Process multi-target", {
data(mtcars)
y_multi <- data.frame(
y1 = mtcars$mpg,
y2 = mtcars$mpg ^ 2
)
for (y_inp in list(y_multi, as.matrix(y_multi), data.table::as.data.table(y_multi))) {
res_multi <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL)
expect_equal(
res_multi$dmatrix_args$label,
as.matrix(y_multi)
)
expect_equal(
res_multi$metadata$y_names,
c("y1", "y2")
)
expect_equal(
res_multi$params$objective,
"reg:pseudohubererror"
)
}
expect_error({
process.y.margin.and.objective(y_multi, NULL, "count:poisson", NULL)
})
y_bad <- data.frame(
c1 = seq(1, 3),
c2 = rep(as.Date("2024-01-01"), 3)
)
expect_error({
process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL)
})
y_bad <- data.frame(
c1 = seq(1, 3),
c2 = factor(c('a', 'b', 'a'), c('a', 'b'))
)
expect_error({
process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL)
})
y_bad <- seq(1, 20)
dim(y_bad) <- c(5, 2, 2)
expect_error({
process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL)
})
})
test_that("Process base_margin", {
y <- seq(101, 110)
bm_good <- seq(1, 10)
for (bm in list(bm_good, as.matrix(bm_good), as.data.frame(as.matrix(bm_good)))) {
res <- process.y.margin.and.objective(y, bm, "reg:squarederror", NULL)
expect_equal(
res$dmatrix_args$base_margin,
seq(1, 10)
)
}
expect_error({
process.y.margin.and.objective(y, 5, "reg:squarederror", NULL)
})
expect_error({
process.y.margin.and.objective(y, seq(1, 5), "reg:squarederror", NULL)
})
expect_error({
process.y.margin.and.objective(y, matrix(seq(1, 20), ncol = 2), "reg:squarederror", NULL)
})
expect_error({
process.y.margin.and.objective(
y,
as.data.frame(matrix(seq(1, 20), ncol = 2)),
"reg:squarederror",
NULL
)
})
y <- factor(c('a', 'b', 'c', 'a'))
bm_good <- matrix(seq(1, 12), ncol = 3)
for (bm in list(bm_good, as.data.frame(bm_good))) {
res <- process.y.margin.and.objective(y, bm, "multi:softprob", NULL)
expect_equal(
res$dmatrix_args$base_margin |> unname(),
matrix(seq(1, 12), ncol = 3)
)
}
expect_error({
process.y.margin.and.objective(y, as.numeric(bm_good), "multi:softprob", NULL)
})
expect_error({
process.y.margin.and.objective(y, 5, "multi:softprob", NULL)
})
expect_error({
process.y.margin.and.objective(y, bm_good[, 1], "multi:softprob", NULL)
})
expect_error({
process.y.margin.and.objective(y, bm_good[, c(1, 2)], "multi:softprob", NULL)
})
expect_error({
process.y.margin.and.objective(y, bm_good[c(1, 2), ], "multi:softprob", NULL)
})
y <- seq(101, 110)
bm_good <- matrix(seq(1, 30), ncol = 3)
params <- list(quantile_alpha = c(0.1, 0.5, 0.9))
for (bm in list(bm_good, as.data.frame(bm_good))) {
res <- process.y.margin.and.objective(y, bm, "reg:quantileerror", params)
expect_equal(
res$dmatrix_args$base_margin |> unname(),
matrix(seq(1, 30), ncol = 3)
)
}
expect_error({
process.y.margin.and.objective(y, as.numeric(bm_good), "reg:quantileerror", params)
})
expect_error({
process.y.margin.and.objective(y, 5, "reg:quantileerror", params)
})
expect_error({
process.y.margin.and.objective(y, bm_good[, 1], "reg:quantileerror", params)
})
expect_error({
process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:quantileerror", params)
})
expect_error({
process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:quantileerror", params)
})
y <- matrix(seq(101, 130), ncol = 3)
for (bm in list(bm_good, as.data.frame(bm_good))) {
res <- process.y.margin.and.objective(y, bm, "reg:squarederror", params)
expect_equal(
res$dmatrix_args$base_margin |> unname(),
matrix(seq(1, 30), ncol = 3)
)
}
expect_error({
process.y.margin.and.objective(y, as.numeric(bm_good), "reg:squarederror", params)
})
expect_error({
process.y.margin.and.objective(y, 5, "reg:squarederror", params)
})
expect_error({
process.y.margin.and.objective(y, bm_good[, 1], "reg:squarederror", params)
})
expect_error({
process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:squarederror", params)
})
expect_error({
process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:squarederror", params)
})
})
test_that("Process monotone constraints", {
data(iris)
mc_list <- list(Sepal.Width = 1)
res <- process.x.and.col.args(
iris,
monotone_constraints = mc_list,
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$params$monotone_constraints,
c(0, 1, 0, 0, 0)
)
mc_list2 <- list(Sepal.Width = 1, Petal.Width = -1)
res <- process.x.and.col.args(
iris,
monotone_constraints = mc_list2,
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$params$monotone_constraints,
c(0, 1, 0, -1, 0)
)
mc_vec <- c(0, 1, -1, 0, 0)
res <- process.x.and.col.args(
iris,
monotone_constraints = mc_vec,
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$params$monotone_constraints,
c(0, 1, -1, 0, 0)
)
mc_named_vec <- c(1, 1)
names(mc_named_vec) <- names(iris)[1:2]
res <- process.x.and.col.args(
iris,
monotone_constraints = mc_named_vec,
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$params$monotone_constraints,
c(1, 1, 0, 0, 0)
)
mc_named_all <- c(0, -1, 1, 0, -1)
names(mc_named_all) <- rev(names(iris))
res <- process.x.and.col.args(
iris,
monotone_constraints = mc_named_all,
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$params$monotone_constraints,
rev(mc_named_all) |> unname()
)
expect_error({
process.x.and.col.args(
iris,
monotone_constraints = list(
Sepal.Width = 1,
Petal.Width = -1,
Sepal.Width = -1
),
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
})
expect_error({
process.x.and.col.args(
iris,
monotone_constraints = rep(0, 6),
interaction_constraints = NULL,
feature_weights = NULL,
lst_args = list(),
use_qdm = FALSE
)
})
})
test_that("Process interaction_constraints", {
data(iris)
res <- process.x.and.col.args(iris, NULL, list(c(1L, 2L)), NULL, NULL, FALSE)
expect_equal(
res$params$interaction_constraints,
list(c(0, 1))
)
res <- process.x.and.col.args(iris, NULL, list(c(1.0, 2.0)), NULL, NULL, FALSE)
expect_equal(
res$params$interaction_constraints,
list(c(0, 1))
)
res <- process.x.and.col.args(iris, NULL, list(c(1, 2), c(3, 4)), NULL, NULL, FALSE)
expect_equal(
res$params$interaction_constraints,
list(c(0, 1), c(2, 3))
)
res <- process.x.and.col.args(
iris, NULL, list(c("Sepal.Length", "Sepal.Width")), NULL, NULL, FALSE
)
expect_equal(
res$params$interaction_constraints,
list(c(0, 1))
)
res <- process.x.and.col.args(
as.matrix(iris),
NULL,
list(c("Sepal.Length", "Sepal.Width")),
NULL,
NULL,
FALSE
)
expect_equal(
res$params$interaction_constraints,
list(c(0, 1))
)
res <- process.x.and.col.args(
iris,
NULL,
list(c("Sepal.Width", "Petal.Length"), c("Sepal.Length", "Petal.Width", "Species")),
NULL,
NULL,
FALSE
)
expect_equal(
res$params$interaction_constraints,
list(c(1, 2), c(0, 3, 4))
)
expect_error({
process.x.and.col.args(iris, NULL, list(c(1L, 20L)), NULL, NULL, FALSE)
})
expect_error({
process.x.and.col.args(iris, NULL, list(c(0L, 2L)), NULL, NULL, FALSE)
})
expect_error({
process.x.and.col.args(iris, NULL, list(c("1", "2")), NULL, NULL, FALSE)
})
expect_error({
process.x.and.col.args(iris, NULL, list(c("Sepal", "Petal")), NULL, NULL, FALSE)
})
expect_error({
process.x.and.col.args(iris, NULL, c(1L, 2L), NULL, NULL, FALSE)
})
expect_error({
process.x.and.col.args(iris, NULL, matrix(c(1L, 2L)), NULL, NULL, FALSE)
})
expect_error({
process.x.and.col.args(iris, NULL, list(c(1, 2.5)), NULL, NULL, FALSE)
})
})
test_that("Sparse matrices are casted to CSR for QDM", {
data(agaricus.test, package = "xgboost")
x <- agaricus.test$data
for (x_in in list(x, methods::as(x, "TsparseMatrix"))) {
res <- process.x.and.col.args(
x_in,
NULL,
NULL,
NULL,
NULL,
TRUE
)
expect_s4_class(res$dmatrix_args$data, "dgRMatrix")
}
})
test_that("Process feature_weights", {
data(iris)
w_vector <- seq(1, 5)
res <- process.x.and.col.args(
iris,
monotone_constraints = NULL,
interaction_constraints = NULL,
feature_weights = w_vector,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$dmatrix_args$feature_weights,
seq(1, 5)
)
w_named_vector <- seq(1, 5)
names(w_named_vector) <- rev(names(iris))
res <- process.x.and.col.args(
iris,
monotone_constraints = NULL,
interaction_constraints = NULL,
feature_weights = w_named_vector,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$dmatrix_args$feature_weights,
rev(seq(1, 5))
)
w_list <- list(
Species = 5,
Sepal.Length = 1,
Sepal.Width = 2,
Petal.Length = 3,
Petal.Width = 4
)
res <- process.x.and.col.args(
iris,
monotone_constraints = NULL,
interaction_constraints = NULL,
feature_weights = w_list,
lst_args = list(),
use_qdm = FALSE
)
expect_equal(
res$dmatrix_args$feature_weights,
seq(1, 5)
)
})
test_that("Whole function works", {
data(cancer, package = "survival")
y <- Surv(cancer$time, cancer$status - 1, type = "right")
x <- as.data.table(cancer)[, -c("time", "status")]
model <- xgboost(
x,
y,
monotone_constraints = list(age = -1),
nthreads = 1L,
nrounds = 5L,
eta = 3
)
expect_equal(
attributes(model)$params$objective,
"survival:aft"
)
expect_equal(
attributes(model)$metadata$n_targets,
1L
)
expect_equal(
attributes(model)$params$monotone_constraints,
"(0,-1,0,0,0,0,0,0)"
)
expect_false(
"interaction_constraints" %in% names(attributes(model)$params)
)
expect_equal(
attributes(model)$params$eta,
3
)
txt <- capture.output({
print(model)
})
expect_true(any(grepl("Objective: survival:aft", txt, fixed = TRUE)))
expect_true(any(grepl("monotone_constraints", txt, fixed = TRUE)))
expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE)))
expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE)))
})


@@ -173,8 +173,9 @@ Build the model
 The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
 
 ```{r}
-bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
-               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")
+bst <- xgboost(x = sparse_matrix, y = output_vector,
+               params = list(max_depth = 4, eta = 1),
+               nthread = 2, nrounds = 10)
 ```
@@ -299,28 +300,28 @@ test <- agaricus.test
 #Random Forest - 1000 trees
 bst <- xgboost(
-  data = train$data,
-  label = train$label,
-  max_depth = 4,
-  num_parallel_tree = 1000,
-  subsample = 0.5,
-  colsample_bytree = 0.5,
+  x = train$data,
+  y = factor(train$label, levels = c(0, 1)),
+  params = list(
+    max_depth = 4,
+    num_parallel_tree = 1000,
+    subsample = 0.5,
+    colsample_bytree = 0.5
+  ),
   nrounds = 1,
-  objective = "binary:logistic",
   nthread = 2
 )
 
 #Boosting - 3 rounds
 bst <- xgboost(
-  data = train$data,
-  label = train$label,
-  max_depth = 4,
+  x = train$data,
+  y = factor(train$label, levels = c(0, 1)),
+  params = list(max_depth = 4),
   nrounds = 3,
-  objective = "binary:logistic",
   nthread = 2
 )
 ```
 
-> Note that the parameter `round` is set to `1`.
+> Note that the parameter `nrounds` is set to `1`.
 > [**Random Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software.


@@ -146,22 +146,19 @@ In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore,
 We will train decision tree model using the following parameters:
 
-* `objective = "binary:logistic"`: we will train a binary classification model ;
+* `objective = "binary:logistic"`: we will train a binary classification model (note that this is set automatically when `y` is a `factor`) ;
 * `max_depth = 2`: the trees won't be deep, because our case is very simple ;
 * `nthread = 2`: the number of CPU threads we are going to use;
 * `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
 
 ```{r trainingSparse, message=F, warning=F}
 bstSparse <- xgboost(
-  data = train$data
-  , label = train$label
-  , params = list(
-    max_depth = 2
-    , eta = 1
-    , nthread = 2
-    , objective = "binary:logistic"
-  )
+  x = train$data
+  , y = factor(train$label, levels = c(0, 1))
+  , objective = "binary:logistic"
+  , params = list(max_depth = 2, eta = 1)
   , nrounds = 2
+  , nthread = 2
 )
 ```
@@ -175,15 +172,11 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
 ```{r trainingDense, message=F, warning=F}
 bstDense <- xgboost(
-  data = as.matrix(train$data),
-  label = train$label,
-  params = list(
-    max_depth = 2,
-    eta = 1,
-    nthread = 2,
-    objective = "binary:logistic"
-  ),
-  nrounds = 2
+  x = as.matrix(train$data),
+  y = factor(train$label, levels = c(0, 1)),
+  params = list(max_depth = 2, eta = 1),
+  nrounds = 2,
+  nthread = 2
 )
 ```
@@ -193,7 +186,7 @@ bstDense <- xgboost(
 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
-bstDMatrix <- xgboost(
+bstDMatrix <- xgb.train(
   data = dtrain,
   params = list(
     max_depth = 2,
@@ -213,7 +206,7 @@ One of the simplest way to see the training progress is to set the `verbose` opt
 ```{r trainingVerbose0, message=T, warning=F}
 # verbose = 0, no message
-bst <- xgboost(
+bst <- xgb.train(
   data = dtrain
   , params = list(
     max_depth = 2
@@ -228,7 +221,7 @@ bst <- xgboost(
 ```{r trainingVerbose1, message=T, warning=F}
 # verbose = 1, print evaluation metric
-bst <- xgboost(
+bst <- xgb.train(
   data = dtrain
   , params = list(
     max_depth = 2
@@ -243,7 +236,7 @@ bst <- xgboost(
 ```{r trainingVerbose2, message=T, warning=F}
 # verbose = 2, also print information about tree
-bst <- xgboost(
+bst <- xgb.train(
   data = dtrain
   , params = list(
     max_depth = 2


@@ -178,9 +178,10 @@ parameter:
 Using feature name instead
 **************************
 
-XGBoost's Python package supports using feature names instead of feature index for
+XGBoost's Python and R packages support using feature names instead of feature index for
 specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the
-feature interaction constraint can be specified as ``[["f0", "f2"]]``.
+feature interaction constraint can be specified as ``[["f0", "f2"]]`` (Python) or
+``list(c("f0", "f2"))`` (R, when passing them to function ``xgboost()``).
 
 **************
 Advanced topic
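A minimal R sketch of that call (assuming a data frame ``df`` with columns ``f0``, ``f1``, ``f2`` and a response ``y``):

model <- xgboost(df, y, interaction_constraints = list(c("f0", "f2")),
                 nthreads = 1, nrounds = 10)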


@@ -97,7 +97,8 @@ Some other examples:
 Using feature names
 *******************
 
-XGBoost's Python package supports using feature names instead of feature index for
+XGBoost's Python and R packages support using feature names instead of feature indices for
 specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the
-monotonic constraint can be specified as ``{"f0": 1, "f2": -1}``, and ``"f1"`` will
+monotonic constraint can be specified as ``{"f0": 1, "f2": -1}`` (Python) or as
+``list(f0=1, f2=-1)`` (R, when using 'xgboost()', but not 'xgb.train'), and ``"f1"`` will
 default to ``0`` (no constraint).
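A corresponding R sketch (again assuming a data frame ``df`` with columns ``f0``, ``f1``, ``f2`` and a response ``y``):

model <- xgboost(df, y, monotone_constraints = list(f0 = 1, f2 = -1),
                 nthreads = 1, nrounds = 10)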