Support dataframe data format in native XGBoost. (#9828)

- Implement a columnar adapter.
- Refactor Python pandas handling code to avoid converting into a single numpy array.
- Add support in R for transforming columns.
- Support R data.frame and factor type.
This commit is contained in:
Jiaming Yuan
2023-12-12 09:56:31 +08:00
committed by GitHub
parent b3700bbb3f
commit faf0f2df10
21 changed files with 718 additions and 221 deletions

View File

@@ -19,7 +19,8 @@
#' @param missing a float value to represent missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features.
#' @param feature_names Set names for features. Overrides column names in data
#' frame and matrix.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking groups.
#' @param qid Query ID for data samples, used for ranking.
@@ -32,6 +33,8 @@
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
#' from the original source of data.
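#' @param enable_categorical Experimental support for specialized handling of
#' categorical features. Must be set to `TRUE` when `data` is a data.frame
#' containing factor columns; otherwise an error is raised.
#' JSON/UBJSON serialization format is required.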
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@@ -58,19 +61,26 @@ xgb.DMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL
feature_weights = NULL,
enable_categorical = FALSE
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
ctypes <- NULL
if (typeof(data) == "character") {
if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.")
if (length(data) > 1) {
stop(
"'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename."
)
}
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) {
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)))
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
)
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call(
XGDMatrixCreateFromCSC_R,
@@ -103,6 +113,39 @@ xgb.DMatrix <- function(
missing,
as.integer(NVL(nthread, -1))
)
} else if (is.data.frame(data)) {
ctypes <- sapply(data, function(x) {
if (is.factor(x)) {
if (!enable_categorical) {
stop(
"When factor type is used, the parameter `enable_categorical`",
" must be set to TRUE."
)
}
"c"
} else if (is.integer(x)) {
"int"
} else if (is.logical(x)) {
"i"
} else {
if (!is.numeric(x)) {
stop("Invalid type in dataframe.")
}
"float"
}
})
## as.data.frame somehow converts integer/logical into real.
data <- as.data.frame(sapply(data, function(x) {
if (is.factor(x)) {
## XGBoost uses 0-based indexing.
as.numeric(x) - 1
} else {
x
}
}))
handle <- .Call(
XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
)
} else {
stop("xgb.DMatrix does not support construction from ", typeof(data))
}
@@ -137,6 +180,9 @@ xgb.DMatrix <- function(
if (!is.null(feature_weights)) {
setinfo(dmat, "feature_weights", feature_weights)
}
if (!is.null(ctypes)) {
setinfo(dmat, "feature_type", ctypes)
}
return(dmat)
}