Support dataframe data format in native XGBoost. (#9828)
- Implement a columnar adapter. - Refactor Python pandas handling code to avoid converting into a single numpy array. - Add support in R for transforming columns. - Support R data.frame and factor type.
This commit is contained in:
@@ -19,7 +19,8 @@
|
||||
#' @param missing a float value that represents missing values in data (used only when input is a dense matrix).
|
||||
#' It is useful when a 0 or some other extreme value represents missing values in data.
|
||||
#' @param silent whether to suppress printing an informational message after loading from a file.
|
||||
#' @param feature_names Set names for features.
|
||||
#' @param feature_names Set names for features. Overrides column names in data
|
||||
#' frame and matrix.
|
||||
#' @param nthread Number of threads used for creating DMatrix.
|
||||
#' @param group Group sizes for all ranking groups.
|
||||
#' @param qid Query ID for data samples, used for ranking.
|
||||
@@ -32,6 +33,8 @@
|
||||
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
|
||||
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
|
||||
#' from the original source of data.
|
||||
#' @param enable_categorical Experimental support of specializing for
|
||||
#' categorical features. JSON/UBJSON serialization format is required.
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
@@ -58,19 +61,26 @@ xgb.DMatrix <- function(
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL
|
||||
feature_weights = NULL,
|
||||
enable_categorical = FALSE
|
||||
) {
|
||||
if (!is.null(group) && !is.null(qid)) {
|
||||
stop("Either one of 'group' or 'qid' should be NULL")
|
||||
}
|
||||
ctypes <- NULL
|
||||
if (typeof(data) == "character") {
|
||||
if (length(data) > 1)
|
||||
stop("'data' has class 'character' and length ", length(data),
|
||||
".\n 'data' accepts either a numeric matrix or a single filename.")
|
||||
if (length(data) > 1) {
|
||||
stop(
|
||||
"'data' has class 'character' and length ", length(data),
|
||||
".\n 'data' accepts either a numeric matrix or a single filename."
|
||||
)
|
||||
}
|
||||
data <- path.expand(data)
|
||||
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
||||
} else if (is.matrix(data)) {
|
||||
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)))
|
||||
handle <- .Call(
|
||||
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
|
||||
)
|
||||
} else if (inherits(data, "dgCMatrix")) {
|
||||
handle <- .Call(
|
||||
XGDMatrixCreateFromCSC_R,
|
||||
@@ -103,6 +113,39 @@ xgb.DMatrix <- function(
|
||||
missing,
|
||||
as.integer(NVL(nthread, -1))
|
||||
)
|
||||
} else if (is.data.frame(data)) {
|
||||
## Map every column to the feature-type code later stored via
## setinfo(dmat, "feature_type", ...): "c" for factors, "int" for integers,
## "i" for logicals and "float" for any other numeric column.
## `vapply` is used instead of `sapply` so the result is always a character
## vector, independent of the number of columns.
ctypes <- vapply(data, function(x) {
  if (is.factor(x)) {
    if (!enable_categorical) {
      stop(
        "When factor type is used, the parameter `enable_categorical`",
        " must be set to TRUE."
      )
    }
    "c"
  } else if (is.integer(x)) {
    "int"
  } else if (is.logical(x)) {
    "i"
  } else {
    if (!is.numeric(x)) {
      stop("Invalid type in dataframe.")
    }
    "float"
  }
}, character(1))
## Convert column by column and keep the data.frame shell (names, number of
## rows). Going through `sapply` + `as.data.frame` is not shape-safe: for a
## one-row data frame `sapply` simplifies to a plain vector, which
## `as.data.frame` then turns into a single column with one row per feature.
## Every column is coerced to double explicitly so that integer and logical
## NA become NA_real_ before the data is handed to the native code.
data[] <- lapply(data, function(x) {
  if (is.factor(x)) {
    ## XGBoost uses 0-based indexing.
    as.numeric(x) - 1
  } else {
    as.numeric(x)
  }
})
handle <- .Call(
  XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
)
|
||||
} else {
|
||||
stop("xgb.DMatrix does not support construction from ", typeof(data))
|
||||
}
|
||||
@@ -137,6 +180,9 @@ xgb.DMatrix <- function(
|
||||
if (!is.null(feature_weights)) {
|
||||
setinfo(dmat, "feature_weights", feature_weights)
|
||||
}
|
||||
if (!is.null(ctypes)) {
|
||||
setinfo(dmat, "feature_type", ctypes)
|
||||
}
|
||||
|
||||
return(dmat)
|
||||
}
|
||||
|
||||
@@ -17,7 +17,8 @@ xgb.DMatrix(
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL
|
||||
feature_weights = NULL,
|
||||
enable_categorical = FALSE
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
@@ -42,7 +43,8 @@ It is useful when a 0 or some other extreme value represents missing values in d
|
||||
|
||||
\item{silent}{whether to suppress printing an informational message after loading from a file.}
|
||||
|
||||
\item{feature_names}{Set names for features.}
|
||||
\item{feature_names}{Set names for features. Overrides column names in data
|
||||
frame and matrix.}
|
||||
|
||||
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||
|
||||
@@ -55,6 +57,9 @@ It is useful when a 0 or some other extreme value represents missing values in d
|
||||
\item{label_upper_bound}{Upper bound for survival training.}
|
||||
|
||||
\item{feature_weights}{Set feature weights for column sampling.}
|
||||
|
||||
\item{enable_categorical}{Experimental support of specializing for
|
||||
categorical features. JSON/UBJSON serialization format is required.}
|
||||
}
|
||||
\description{
|
||||
Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, a data frame, or a local file.
|
||||
|
||||
@@ -41,6 +41,7 @@ extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixNumCol_R(SEXP);
|
||||
extern SEXP XGDMatrixNumRow_R(SEXP);
|
||||
@@ -79,6 +80,7 @@ static const R_CallMethodDef CallEntries[] = {
|
||||
{"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
|
||||
{"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
|
||||
{"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},
|
||||
{"XGDMatrixCreateFromDF_R", (DL_FUNC) &XGDMatrixCreateFromDF_R, 3},
|
||||
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
|
||||
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
|
||||
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
|
||||
|
||||
@@ -223,6 +223,69 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * Create a DMatrix from an R data.frame (a list of equally sized columns).
 *
 * Each column is exposed as a 1-d array interface, the interfaces are dumped
 * to a JSON array and passed to XGDMatrixCreateFromColumnar together with a
 * JSON config holding `missing` and `nthread`.
 *
 * @param df        R data.frame; must contain at least one column.
 * @param missing   R scalar read with asReal(); forwarded as "missing".
 * @param n_threads R scalar read with asInteger(); forwarded as "nthread".
 * @return External pointer wrapping the new DMatrix handle, with
 *         _DMatrixFinalizer registered on it.
 */
XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
  SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();

  DMatrixHandle handle;

  // Wrap a raw column buffer as an array interface. The length is kept as
  // R_xlen_t (what Rf_xlength returns) so it is not narrowed to 32 bits.
  auto make_vec = [&](auto const *ptr, R_xlen_t len) {
    auto v = xgboost::linalg::MakeVec(ptr, static_cast<std::size_t>(len));
    return xgboost::linalg::ArrayInterface(v);
  };

  std::int32_t rc{0};
  {
    using xgboost::Json;
    auto n_features = Rf_xlength(df);
    std::vector<Json> array(n_features);
    CHECK_GT(n_features, 0);
    auto len = Rf_xlength(VECTOR_ELT(df, 0));
    // The `data.frame` in R actually converts all data into numeric. The other type
    // handlers here are not used. At the moment they are kept as a reference for when we
    // can avoid making data copies during transformation.
    // NOTE(review): integer and logical NA are stored by R as NA_INTEGER /
    // NA_LOGICAL; presumably the columnar adapter does not treat those as
    // missing -- confirm before relying on the INTSXP / LGLSXP branches.
    for (decltype(n_features) i = 0; i < n_features; ++i) {
      SEXP col = VECTOR_ELT(df, i);
      // All columns are read with the length of the first one; refuse ragged
      // input instead of reading past the end of a shorter column.
      CHECK_EQ(Rf_xlength(col), len) << "data.frame columns must have the same length.";
      switch (TYPEOF(col)) {
        case INTSXP: {
          auto const *ptr = INTEGER(col);
          array[i] = make_vec(ptr, len);
          break;
        }
        case REALSXP: {
          auto const *ptr = REAL(col);
          array[i] = make_vec(ptr, len);
          break;
        }
        case LGLSXP: {
          auto const *ptr = LOGICAL(col);
          array[i] = make_vec(ptr, len);
          break;
        }
        default: {
          LOG(FATAL) << "data.frame has unsupported type.";
        }
      }
    }

    Json jinterface{std::move(array)};
    auto sinterface = Json::Dump(jinterface);
    Json jconfig{xgboost::Object{}};
    jconfig["missing"] = asReal(missing);
    jconfig["nthread"] = asInteger(n_threads);
    auto sconfig = Json::Dump(jconfig);

    rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
  }

  CHECK_CALL(rc);
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  Rf_unprotect(1);

  return ret;
}
|
||||
|
||||
namespace {
|
||||
void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str,
|
||||
std::string *indices_str, std::string *data_str) {
|
||||
@@ -298,6 +361,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
|
||||
res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
|
||||
config.c_str(), &handle);
|
||||
}
|
||||
CHECK_CALL(res_code);
|
||||
R_SetExternalPtrAddr(ret, handle);
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
R_API_END();
|
||||
|
||||
@@ -53,6 +53,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
|
||||
XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
SEXP missing,
|
||||
SEXP n_threads);
|
||||
|
||||
/**
|
||||
* @brief Create matrix content from a data frame.
|
||||
* @param df R data.frame object
|
||||
* @param missing the value used to represent missing entries
|
||||
* @param n_threads Number of threads used to construct DMatrix from the data frame.
|
||||
* @return created dmatrix
|
||||
*/
|
||||
XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads);
|
||||
|
||||
/*!
|
||||
* \brief create a matrix content from CSC format
|
||||
* \param indptr pointer to column headers
|
||||
|
||||
@@ -322,3 +322,30 @@ test_that("xgb.DMatrix: can get group for both 'qid' and 'group' constructors",
|
||||
expected_gr <- c(0, 20, 40, 100)
|
||||
expect_equal(info_gr, expected_gr)
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: data.frame", {
  # One column per supported R type; character columns become factors through
  # `stringsAsFactors = TRUE`.
  df <- data.frame(
    a = (1:4) / 10,
    num = c(1, NA, 3, 4),
    as.int = as.integer(c(1, 2, 3, 4)),
    lo = c(TRUE, FALSE, NA, TRUE),
    str.fac = c("a", "b", "d", "c"),
    as.fac = as.factor(c(3, 5, 8, 11)),
    stringsAsFactors = TRUE
  )

  m <- xgb.DMatrix(df, enable_categorical = TRUE)
  expect_equal(colnames(m), colnames(df))
  # The DMatrix must keep the shape of the input data frame.
  expect_equal(dim(m), dim(df))
  expect_equal(
    getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
  )
  # Factor columns are rejected unless categorical support is requested. The
  # message is pinned so that an unrelated failure cannot satisfy the check.
  expect_error(xgb.DMatrix(df), "enable_categorical")

  # Factors that contain missing values are still accepted as categorical.
  df <- data.frame(
    missing = c("a", "b", "d", NA),
    valid = c("a", "b", "d", "c"),
    stringsAsFactors = TRUE
  )
  m <- xgb.DMatrix(df, enable_categorical = TRUE)
  expect_equal(getinfo(m, "feature_type"), c("c", "c"))
})
|
||||
|
||||
Reference in New Issue
Block a user