diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index febefb757..8a5d66198 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -77,26 +77,45 @@ xgb.get.handle <- function(object) {
#' Predict method for XGBoost model
#'
-#' Predicted values based on either xgboost model or model handle object.
+#' Predict values on data based on an xgboost model.
#'
#' @param object Object of class `xgb.Booster`.
-#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
+#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
#' local data file, or `xgb.DMatrix`.
-#' For single-row predictions on sparse data, it is recommended to use the CSR format.
-#' If passing a sparse vector, it will take it as a row vector.
-#' @param missing Only used when input is a dense matrix. Pick a float value that represents
-#' missing values in data (e.g., 0 or some other extreme value).
+#'
+#' For single-row predictions on sparse data, it's recommended to use CSR format. If passing
+#' a sparse vector, it will take it as a row vector.
+#'
+#' Note that, for repeated predictions on the same data, one might want to create a DMatrix to
+#' pass here instead of passing R types like matrices or data frames, as predictions will be
+#' faster on DMatrix.
+#'
+#' If `newdata` is a `data.frame`, be aware that:\itemize{
+#' \item Columns will be converted to numeric if they aren't already, which could potentially make
+#' the operation slower than in an equivalent `matrix` object.
+#' \item The order of the columns must match with that of the data from which the model was fitted
+#' (i.e. columns will not be referenced by their names, just by their order in the data).
+#' \item If the model was fitted to data with categorical columns, these columns must be of
+#' `factor` type here, and must use the same encoding (i.e. have the same levels).
+#' \item If `newdata` contains any `factor` columns, they will be converted to base-0
+#' encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
+#' for a column that had a different type during training.
+#' }
+#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value).
+#'
+#' This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, it
+#' should be passed as an argument to the DMatrix constructor instead.
#' @param outputmargin Whether the prediction should be returned in the form of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for
#' logistic regression would return log-odds instead of probabilities.
-#' @param predleaf Whether to predict pre-tree leaf indices.
+#' @param predleaf Whether to predict per-tree leaf indices.
#' @param predcontrib Whether to return feature contributions to individual predictions (see Details).
#' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details).
#' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details).
#' @param reshape Whether to reshape the vector of predictions to matrix form when there are several
#' prediction outputs per case. No effect if `predleaf`, `predcontrib`,
#' or `predinteraction` is `TRUE`.
-#' @param training Whether the predictions are used for training. For dart booster,
+#' @param training Whether the prediction result is used for training. For dart booster,
#' training predicting will perform dropout.
#' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing
#' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e.
@@ -111,6 +130,12 @@ xgb.get.handle <- function(object) {
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
#' type and shape of predictions are invariant to the model type.
+#' @param base_margin Base margin used for boosting from existing model.
+#'
+#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument is not supported -
+#' the base margin should instead be added to the DMatrix (e.g. by passing it as an
+#' argument to its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+#'
#' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
#' match (only applicable when both `object` and `newdata` have feature names).
#'
@@ -287,16 +312,80 @@ xgb.get.handle <- function(object) {
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
- validate_features = FALSE, ...) {
+ validate_features = FALSE, base_margin = NULL, ...) {
if (validate_features) {
newdata <- validate.features(object, newdata)
}
- if (!inherits(newdata, "xgb.DMatrix")) {
+ is_dmatrix <- inherits(newdata, "xgb.DMatrix")
+ if (is_dmatrix && !is.null(base_margin)) {
+ stop(
+ "'base_margin' is not supported when passing 'xgb.DMatrix' as input.",
+ " Should be passed as argument to 'xgb.DMatrix' constructor."
+ )
+ }
+
+ use_as_df <- FALSE
+ use_as_dense_matrix <- FALSE
+ use_as_csr_matrix <- FALSE
+ n_row <- NULL
+ if (!is_dmatrix) {
+
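+    # In-place prediction (without building a DMatrix) only covers regular value /
+    # margin predictions; leaf, contribution, and interaction outputs still go
+    # through the DMatrix path below.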
+ inplace_predict_supported <- !predcontrib && !predinteraction && !predleaf
+ if (inplace_predict_supported) {
+ booster_type <- xgb.booster_type(object)
+ if (booster_type == "gblinear" || (booster_type == "dart" && training)) {
+ inplace_predict_supported <- FALSE
+ }
+ }
+ if (inplace_predict_supported) {
+
+ if (is.matrix(newdata)) {
+ use_as_dense_matrix <- TRUE
+ } else if (is.data.frame(newdata)) {
+        # note: 'newdata' is converted into a plain list of columns below, so the
+        # number of rows has to be recorded before the conversion
+ n_row <- nrow(newdata)
+ newdata <- lapply(
+ newdata,
+ function(x) {
+ if (is.factor(x)) {
+ return(as.numeric(x) - 1)
+ } else {
+ return(as.numeric(x))
+ }
+ }
+ )
+ use_as_df <- TRUE
+ } else if (inherits(newdata, "dgRMatrix")) {
+ use_as_csr_matrix <- TRUE
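+        # components handed to the native CSR predict routine: row pointers ('p'),
+        # column indices ('j'), values ('x'), and the number of columns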
+ csr_data <- list(newdata@p, newdata@j, newdata@x, ncol(newdata))
+ } else if (inherits(newdata, "dsparseVector")) {
+ use_as_csr_matrix <- TRUE
+ n_row <- 1L
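+        # a sparse vector is taken as a single-row CSR matrix; '@i' holds 1-based
+        # positions, while the native code expects 0-based column indices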
+ i <- newdata@i - 1L
+ if (storage.mode(i) != "integer") {
+ storage.mode(i) <- "integer"
+ }
+ csr_data <- list(c(0L, length(i)), i, newdata@x, length(newdata))
+ }
+
+ }
+
+ } # if (!is_dmatrix)
+
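+  # inputs not handled by the in-place paths above (e.g. 'dgCMatrix' or a file path,
+  # or boosters/prediction types where in-place predict is unsupported) fall back
+  # to constructing a DMatrix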
+ if (!is_dmatrix && !use_as_dense_matrix && !use_as_csr_matrix && !use_as_df) {
nthread <- xgb.nthread(object)
newdata <- xgb.DMatrix(
newdata,
- missing = missing, nthread = NVL(nthread, -1)
+ missing = missing,
+ base_margin = base_margin,
+ nthread = NVL(nthread, -1)
)
+ is_dmatrix <- TRUE
+ }
+
+ if (is.null(n_row)) {
+ n_row <- nrow(newdata)
}
@@ -354,18 +443,30 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
args$type <- set_type(6)
}
- predts <- .Call(
- XGBoosterPredictFromDMatrix_R,
- xgb.get.handle(object),
- newdata,
- jsonlite::toJSON(args, auto_unbox = TRUE)
- )
+ json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE)
+ if (is_dmatrix) {
+ predts <- .Call(
+ XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf
+ )
+ } else if (use_as_dense_matrix) {
+ predts <- .Call(
+ XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
+ )
+ } else if (use_as_csr_matrix) {
+ predts <- .Call(
+ XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin
+ )
+ } else if (use_as_df) {
+ predts <- .Call(
+ XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
+ )
+ }
+
names(predts) <- c("shape", "results")
shape <- predts$shape
arr <- predts$results
n_ret <- length(arr)
- n_row <- nrow(newdata)
if (n_row != shape[1]) {
stop("Incorrect predict shape.")
}
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 95e7a51fd..88a2f203e 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -18,25 +18,47 @@
iterationrange = NULL,
strict_shape = FALSE,
validate_features = FALSE,
+ base_margin = NULL,
...
)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}.}
-\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
+\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file, or \code{xgb.DMatrix}.
-For single-row predictions on sparse data, it is recommended to use the CSR format.
-If passing a sparse vector, it will take it as a row vector.}
-\item{missing}{Only used when input is a dense matrix. Pick a float value that represents
-missing values in data (e.g., 0 or some other extreme value).}
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ For single-row predictions on sparse data, it's recommended to use CSR format. If passing
+ a sparse vector, it will take it as a row vector.
+
+ Note that, for repeated predictions on the same data, one might want to create a DMatrix to
+ pass here instead of passing R types like matrices or data frames, as predictions will be
+ faster on DMatrix.
+
+ If `newdata` is a `data.frame`, be aware that:\\itemize\{
+ \\item Columns will be converted to numeric if they aren't already, which could potentially make
+ the operation slower than in an equivalent `matrix` object.
+ \\item The order of the columns must match with that of the data from which the model was fitted
+ (i.e. columns will not be referenced by their names, just by their order in the data).
+ \\item If the model was fitted to data with categorical columns, these columns must be of
+ `factor` type here, and must use the same encoding (i.e. have the same levels).
+ \\item If `newdata` contains any `factor` columns, they will be converted to base-0
+ encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
+ for a column that had a different type during training.
+ \}
+}\if{html}{\out{</div>}}}
+
+\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value).
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, it
+ should be passed as an argument to the DMatrix constructor instead.
+}\if{html}{\out{</div>}}}
\item{outputmargin}{Whether the prediction should be returned in the form of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would return log-odds instead of probabilities.}
-\item{predleaf}{Whether to predict pre-tree leaf indices.}
+\item{predleaf}{Whether to predict per-tree leaf indices.}
\item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}
@@ -48,7 +70,7 @@ logistic regression would return log-odds instead of probabilities.}
prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib},
or \code{predinteraction} is \code{TRUE}.}
-\item{training}{Whether the predictions are used for training. For dart booster,
+\item{training}{Whether the prediction result is used for training. For dart booster,
training predicting will perform dropout.}
\item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
@@ -84,6 +106,13 @@ match (only applicable when both \code{object} and \code{newdata} have feature n
recommended to disable it for performance-sensitive applications.
}\if{html}{\out{</div>}}}
+\item{base_margin}{Base margin used for boosting from existing model.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that, if `newdata` is an `xgb.DMatrix` object, this argument is not supported -
+ the base margin should instead be added to the DMatrix (e.g. by passing it as an
+ argument to its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+}\if{html}{\out{</div>}}}
+
\item{...}{Not used.}
}
\value{
@@ -115,7 +144,7 @@ When \code{strict_shape = TRUE}, the output is always an array:
}
}
\description{
-Predicted values based on either xgboost model or model handle object.
+Predict values on data based on an xgboost model.
}
\details{
Note that \code{iterationrange} would currently do nothing for predictions from "gblinear",
diff --git a/R-package/src/init.c b/R-package/src/init.c
index 36f3e8953..f2635742e 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -37,6 +37,9 @@ extern SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value);
extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle);
extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw);
extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromDense_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromColumnar_R(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterSaveModel_R(SEXP, SEXP);
extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
@@ -96,6 +99,9 @@ static const R_CallMethodDef CallEntries[] = {
{"XGBoosterSerializeToBuffer_R", (DL_FUNC) &XGBoosterSerializeToBuffer_R, 1},
{"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2},
{"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3},
+ {"XGBoosterPredictFromDense_R", (DL_FUNC) &XGBoosterPredictFromDense_R, 5},
+ {"XGBoosterPredictFromCSR_R", (DL_FUNC) &XGBoosterPredictFromCSR_R, 5},
+ {"XGBoosterPredictFromColumnar_R", (DL_FUNC) &XGBoosterPredictFromColumnar_R, 5},
{"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2},
{"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3},
{"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3},
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 4192f82fb..5baf8d412 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -13,6 +13,7 @@
#include
#include
#include
+#include <memory>
#include
#include
#include
@@ -207,25 +208,24 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
return xgboost::Json::Dump(jinterface);
}
-[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
- using namespace ::xgboost; // NOLINT
- Json jconfig{Object{}};
-
- const SEXPTYPE missing_type = TYPEOF(missing);
- if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) ||
- (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) ||
- (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) {
+void AddMissingToJson(xgboost::Json *jconfig, SEXP missing, SEXPTYPE arr_type) {
+ if (Rf_isNull(missing) || ISNAN(Rf_asReal(missing))) {
// missing is not specified
if (arr_type == REALSXP) {
- jconfig["missing"] = std::numeric_limits::quiet_NaN();
+ (*jconfig)["missing"] = std::numeric_limits::quiet_NaN();
} else {
- jconfig["missing"] = R_NaInt;
+ (*jconfig)["missing"] = R_NaInt;
}
} else {
// missing specified
- jconfig["missing"] = Rf_asReal(missing);
+ (*jconfig)["missing"] = Rf_asReal(missing);
}
+}
+[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
+ using namespace ::xgboost; // NOLINT
+ Json jconfig{Object{}};
+ AddMissingToJson(&jconfig, missing, arr_type);
jconfig["nthread"] = Rf_asInteger(n_threads);
return Json::Dump(jconfig);
}
@@ -411,7 +411,7 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
DMatrixHandle handle;
std::int32_t rc{0};
{
- std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
+ const std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
xgboost::Json jconfig{xgboost::Object{}};
jconfig["missing"] = asReal(missing);
jconfig["nthread"] = asInteger(n_threads);
@@ -463,7 +463,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP
Json jconfig{Object{}};
// Construct configuration
jconfig["nthread"] = Integer{threads};
- jconfig["missing"] = xgboost::Number{asReal(missing)};
+ AddMissingToJson(&jconfig, missing, TYPEOF(data));
std::string config;
Json::Dump(jconfig, &config);
res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow,
@@ -498,7 +498,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
Json jconfig{Object{}};
// Construct configuration
jconfig["nthread"] = Integer{threads};
- jconfig["missing"] = xgboost::Number{asReal(missing)};
+ AddMissingToJson(&jconfig, missing, TYPEOF(data));
std::string config;
Json::Dump(jconfig, &config);
res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
@@ -1247,7 +1247,60 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
return mkString(ret);
}
-XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
+namespace {
+
+struct ProxyDmatrixError : public std::exception {};
+
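+// Owns a proxy DMatrix handle and frees it when the wrapper goes out of scope.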
+struct ProxyDmatrixWrapper {
+ DMatrixHandle proxy_dmat_handle;
+
+ ProxyDmatrixWrapper() {
+ int res_code = XGProxyDMatrixCreate(&this->proxy_dmat_handle);
+ if (res_code != 0) {
+ throw ProxyDmatrixError();
+ }
+ }
+
+ ~ProxyDmatrixWrapper() {
+ if (this->proxy_dmat_handle) {
+ XGDMatrixFree(this->proxy_dmat_handle);
+ this->proxy_dmat_handle = nullptr;
+ }
+ }
+
+ DMatrixHandle get_handle() {
+ return this->proxy_dmat_handle;
+ }
+};
+
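+// Builds a proxy DMatrix carrying only the 'base_margin' field, which the inplace
+// predict C functions accept through an optional DMatrix handle; returns an empty
+// pointer when no base margin was supplied.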
+std::unique_ptr<ProxyDmatrixWrapper> GetProxyDMatrixWithBaseMargin(SEXP base_margin) {
+ if (Rf_isNull(base_margin)) {
+    return std::unique_ptr<ProxyDmatrixWrapper>(nullptr);
+ }
+
+ SEXP base_margin_dim = Rf_getAttrib(base_margin, R_DimSymbol);
+ int res_code;
+ try {
+ const std::string array_str = Rf_isNull(base_margin_dim)?
+ MakeArrayInterfaceFromRVector(base_margin) : MakeArrayInterfaceFromRMat(base_margin);
+    std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat(new ProxyDmatrixWrapper());
+ res_code = XGDMatrixSetInfoFromInterface(proxy_dmat->get_handle(),
+ "base_margin",
+ array_str.c_str());
+ if (res_code != 0) {
+ throw ProxyDmatrixError();
+ }
+ return proxy_dmat;
+ } catch(ProxyDmatrixError &err) {
+ Rf_error("%s", XGBGetLastError());
+ }
+}
+
+enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame};
+
+SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config,
+ PredictionInputType input_type, SEXP missing,
+ SEXP base_margin) {
SEXP r_out_shape;
SEXP r_out_result;
SEXP r_out = PROTECT(allocVector(VECSXP, 2));
@@ -1259,9 +1312,79 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con
bst_ulong out_dim;
bst_ulong const *out_shape;
float const *out_result;
- CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
- R_ExternalPtrAddr(dmat), c_json_config,
- &out_shape, &out_dim, &out_result));
+
+ int res_code;
+ {
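+    // DMatrix inputs use the regular predict API; the other input types go through
+    // the inplace-predict C functions, with an optional proxy DMatrix carrying the
+    // base margin.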
+ switch (input_type) {
+ case PredictionInputType::DMatrix: {
+ res_code = XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
+ R_ExternalPtrAddr(input_data), c_json_config,
+ &out_shape, &out_dim, &out_result);
+ break;
+ }
+
+ case PredictionInputType::CSRMatrix: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+ base_margin);
+ DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+
+ SEXP indptr = VECTOR_ELT(input_data, 0);
+ SEXP indices = VECTOR_ELT(input_data, 1);
+ SEXP data = VECTOR_ELT(input_data, 2);
+ const int ncol_csr = Rf_asInteger(VECTOR_ELT(input_data, 3));
+ const SEXPTYPE type_data = TYPEOF(data);
+ CHECK_EQ(type_data, REALSXP);
+ std::string sindptr, sindices, sdata;
+ CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
+
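+        // 'missing' is resolved on the C side: an unspecified value maps to NaN for
+        // real-valued data and to NA integer otherwise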
+ xgboost::StringView json_str(c_json_config);
+ xgboost::Json new_json = xgboost::Json::Load(json_str);
+ AddMissingToJson(&new_json, missing, type_data);
+ const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+ res_code = XGBoosterPredictFromCSR(
+ R_ExternalPtrAddr(handle), sindptr.c_str(), sindices.c_str(), sdata.c_str(),
+ ncol_csr, new_c_json.c_str(), proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+ break;
+ }
+
+ case PredictionInputType::DenseMatrix: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+ base_margin);
+ DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+ const std::string array_str = MakeArrayInterfaceFromRMat(input_data);
+
+ xgboost::StringView json_str(c_json_config);
+ xgboost::Json new_json = xgboost::Json::Load(json_str);
+ AddMissingToJson(&new_json, missing, TYPEOF(input_data));
+ const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+ res_code = XGBoosterPredictFromDense(
+ R_ExternalPtrAddr(handle), array_str.c_str(), new_c_json.c_str(),
+ proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+ break;
+ }
+
+ case PredictionInputType::DataFrame: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+ base_margin);
+ DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+
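+        // data.frame columns have been converted to numeric on the R side, so
+        // 'missing' is interpreted against real-valued (REALSXP) data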
+ const std::string df_str = MakeArrayInterfaceFromRDataFrame(input_data);
+
+ xgboost::StringView json_str(c_json_config);
+ xgboost::Json new_json = xgboost::Json::Load(json_str);
+ AddMissingToJson(&new_json, missing, REALSXP);
+ const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+ res_code = XGBoosterPredictFromColumnar(
+ R_ExternalPtrAddr(handle), df_str.c_str(), new_c_json.c_str(),
+ proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+ break;
+ }
+ }
+ }
+ CHECK_CALL(res_code);
r_out_shape = PROTECT(allocVector(INTSXP, out_dim));
size_t len = 1;
@@ -1282,6 +1405,31 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con
return r_out;
}
+} // namespace
+
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
+ return XGBoosterPredictGeneric(handle, dmat, json_config,
+ PredictionInputType::DMatrix, R_NilValue, R_NilValue);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing,
+ SEXP json_config, SEXP base_margin) {
+ return XGBoosterPredictGeneric(handle, R_mat, json_config,
+ PredictionInputType::DenseMatrix, missing, base_margin);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing,
+ SEXP json_config, SEXP base_margin) {
+ return XGBoosterPredictGeneric(handle, lst, json_config,
+ PredictionInputType::CSRMatrix, missing, base_margin);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing,
+ SEXP json_config, SEXP base_margin) {
+ return XGBoosterPredictGeneric(handle, R_df, json_config,
+ PredictionInputType::DataFrame, missing, base_margin);
+}
+
XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
R_API_BEGIN();
CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 652345e52..70fd885e7 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -371,6 +371,50 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
* \return A list containing 2 vectors, first one for shape while second one for prediction result.
*/
XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config);
+
+/*!
+ * \brief Run prediction on R dense matrix
+ * \param handle handle
+ * \param R_mat R matrix
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing,
+ SEXP json_config, SEXP base_margin);
+
+/*!
+ * \brief Run prediction on R CSR matrix
+ * \param handle handle
+ * \param lst An R list, containing, in this order:
+ * (a) 'p' array (a.k.a. indptr)
+ * (b) 'j' array (a.k.a. indices)
+ * (c) 'x' array (a.k.a. data / values)
+ * (d) number of columns
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromCSR` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing,
+ SEXP json_config, SEXP base_margin);
+
+/*!
+ * \brief Run prediction on R data.frame
+ * \param handle handle
+ * \param R_df R data.frame
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing,
+ SEXP json_config, SEXP base_margin);
+
/*!
* \brief load model from existing file
* \param handle handle
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index fb3162e42..5438c8bb2 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -139,8 +139,8 @@ test_that("dart prediction works", {
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds))
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
- expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
- expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
+ expect_equal(pred_by_train_0, pred_by_xgboost_0, tolerance = 1e-6)
+ expect_equal(pred_by_train_1, pred_by_xgboost_1, tolerance = 1e-6)
expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
})
@@ -651,6 +651,51 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", {
expect_equal(pred_qid, pred_gr)
})
+test_that("Can predict on data.frame objects", {
+ data("mtcars")
+ y <- mtcars$mpg
+ x_df <- mtcars[, -1]
+ x_mat <- as.matrix(x_df)
+ dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads)
+ model <- xgb.train(
+ params = list(
+ tree_method = "hist",
+ objective = "reg:squarederror",
+ nthread = n_threads
+ ),
+ data = dm,
+ nrounds = 5
+ )
+
+ pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads)
+ pred_df <- predict(model, x_df, nthread = n_threads)
+ expect_equal(pred_mat, pred_df)
+})
+
+test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", {
+ data("mtcars")
+ y <- mtcars$mpg
+ x <- as.matrix(mtcars[, -1])
+ dm <- xgb.DMatrix(x, label = y, nthread = n_threads)
+ model <- xgb.train(
+ params = list(
+ tree_method = "hist",
+ objective = "reg:squarederror",
+ nthread = n_threads
+ ),
+ data = dm,
+ nrounds = 5
+ )
+
+ set.seed(123)
+ base_margin <- rnorm(nrow(x))
+ dm_w_base <- xgb.DMatrix(data = x, base_margin = base_margin)
+ pred_from_dm <- predict(model, dm_w_base)
+ pred_from_mat <- predict(model, x, base_margin = base_margin)
+
+ expect_equal(pred_from_dm, pred_from_mat)
+})
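+
+# A minimal sketch exercising the new sparse-vector path: a 'dsparseVector' is
+# taken as a single row, so its predictions should match those for the same row
+# given as a one-row CSR matrix.
+test_that("Sparse vector is predicted like a single-row CSR matrix", {
+  data("mtcars")
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1])
+  dm <- xgb.DMatrix(x, label = y, nthread = n_threads)
+  model <- xgb.train(
+    params = list(
+      tree_method = "hist",
+      objective = "reg:squarederror",
+      nthread = n_threads
+    ),
+    data = dm,
+    nrounds = 5
+  )
+
+  row_csr <- as(x[1, , drop = FALSE], "RsparseMatrix")
+  row_vec <- as(x[1, ], "sparseVector")
+  expect_equal(predict(model, row_csr), predict(model, row_vec))
+})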
+
test_that("Coefficients from gblinear have the expected shape and names", {
# Single-column coefficients
data(mtcars)
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 45bcac08d..061240644 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -302,6 +302,37 @@ test_that("xgb.DMatrix: Inf as missing", {
file.remove(fname_nan)
})
+test_that("xgb.DMatrix: missing in CSR", {
+ x_dense <- matrix(as.numeric(1:10), nrow = 5)
+ x_dense[2, 1] <- NA_real_
+
+ x_csr <- as(x_dense, "RsparseMatrix")
+
+ m_dense <- xgb.DMatrix(x_dense, nthread = n_threads, missing = NA_real_)
+ xgb.DMatrix.save(m_dense, "dense.dmatrix")
+
+ m_csr <- xgb.DMatrix(x_csr, nthread = n_threads, missing = NA)
+ xgb.DMatrix.save(m_csr, "csr.dmatrix")
+
+ denseconn <- file("dense.dmatrix", "rb")
+ csrconn <- file("csr.dmatrix", "rb")
+
+ expect_equal(file.size("dense.dmatrix"), file.size("csr.dmatrix"))
+
+ bytes <- file.size("dense.dmatrix")
+ densedmatrix <- readBin(denseconn, "raw", n = bytes)
+ csrmatrix <- readBin(csrconn, "raw", n = bytes)
+
+ expect_equal(length(densedmatrix), length(csrmatrix))
+ expect_equal(densedmatrix, csrmatrix)
+
+ close(denseconn)
+ close(csrconn)
+
+ file.remove("dense.dmatrix")
+ file.remove("csr.dmatrix")
+})
+
test_that("xgb.DMatrix: error on three-dimensional array", {
set.seed(123)
x <- matrix(rnorm(500), nrow = 50)