[R] Add missing DMatrix functions (#9929)
* `XGDMatrixGetQuantileCut` * `XGDMatrixNumNonMissing` * `XGDMatrixGetDataAsCSR` --------- Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
parent
49247458f9
commit
3c004a4145
@ -37,6 +37,9 @@ export(xgb.create.features)
|
|||||||
export(xgb.cv)
|
export(xgb.cv)
|
||||||
export(xgb.dump)
|
export(xgb.dump)
|
||||||
export(xgb.gblinear.history)
|
export(xgb.gblinear.history)
|
||||||
|
export(xgb.get.DMatrix.data)
|
||||||
|
export(xgb.get.DMatrix.num.non.missing)
|
||||||
|
export(xgb.get.DMatrix.qcut)
|
||||||
export(xgb.get.config)
|
export(xgb.get.config)
|
||||||
export(xgb.ggplot.deepness)
|
export(xgb.ggplot.deepness)
|
||||||
export(xgb.ggplot.importance)
|
export(xgb.ggplot.importance)
|
||||||
@ -60,6 +63,7 @@ export(xgb.unserialize)
|
|||||||
export(xgboost)
|
export(xgboost)
|
||||||
import(methods)
|
import(methods)
|
||||||
importClassesFrom(Matrix,dgCMatrix)
|
importClassesFrom(Matrix,dgCMatrix)
|
||||||
|
importClassesFrom(Matrix,dgRMatrix)
|
||||||
importClassesFrom(Matrix,dgeMatrix)
|
importClassesFrom(Matrix,dgeMatrix)
|
||||||
importFrom(Matrix,colSums)
|
importFrom(Matrix,colSums)
|
||||||
importFrom(Matrix,sparse.model.matrix)
|
importFrom(Matrix,sparse.model.matrix)
|
||||||
@ -83,6 +87,7 @@ importFrom(graphics,points)
|
|||||||
importFrom(graphics,title)
|
importFrom(graphics,title)
|
||||||
importFrom(jsonlite,fromJSON)
|
importFrom(jsonlite,fromJSON)
|
||||||
importFrom(jsonlite,toJSON)
|
importFrom(jsonlite,toJSON)
|
||||||
|
importFrom(methods,new)
|
||||||
importFrom(stats,median)
|
importFrom(stats,median)
|
||||||
importFrom(stats,predict)
|
importFrom(stats,predict)
|
||||||
importFrom(utils,head)
|
importFrom(utils,head)
|
||||||
|
|||||||
@ -526,6 +526,111 @@ setinfo.xgb.DMatrix <- function(object, name, info) {
|
|||||||
stop("setinfo: unknown info name ", name)
|
stop("setinfo: unknown info name ", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#' @title Get Quantile Cuts from DMatrix
|
||||||
|
#' @description Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
|
||||||
|
#' that has been quantized for the histogram method (`tree_method="hist"`).
|
||||||
|
#'
|
||||||
|
#' These cuts are used in order to assign observations to bins - i.e. these are ordered
|
||||||
|
#' boundaries which are used to determine assignment condition `border_low < x < border_high`.
|
||||||
|
#' As such, the first and last bin will be outside of the range of the data, so as to include
|
||||||
|
#' all of the observations there.
|
||||||
|
#'
|
||||||
|
#' If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
|
||||||
|
#' which will be output in sorted order from lowest to highest.
|
||||||
|
#'
|
||||||
|
#' Different columns can have different numbers of bins according to their range.
|
||||||
|
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
|
||||||
|
#' @param output Output format for the quantile cuts. Possible options are:\itemize{
|
||||||
|
#' \item `"list"` will return the output as a list with one entry per column, where
|
||||||
|
#' each column will have a numeric vector with the cuts. The list will be named if
|
||||||
|
#' `dmat` has column names assigned to it.
|
||||||
|
#' \item `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
|
||||||
|
#' `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
|
||||||
|
#' `indptr[i]+1` to `indptr[i+1]`.
|
||||||
|
#' }
|
||||||
|
#' @return The quantile cuts, in the format specified by parameter `output`.
|
||||||
|
#' @examples
|
||||||
|
#' library(xgboost)
|
||||||
|
#' data(mtcars)
|
||||||
|
#' y <- mtcars$mpg
|
||||||
|
#' x <- as.matrix(mtcars[, -1])
|
||||||
|
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||||
|
#'
|
||||||
|
#' # DMatrix is not quantized right away, but will be once a hist model is generated
|
||||||
|
#' model <- xgb.train(
|
||||||
|
#' data = dm,
|
||||||
|
#' params = list(
|
||||||
|
#' tree_method = "hist",
|
||||||
|
#' max_bin = 8,
|
||||||
|
#' nthread = 1
|
||||||
|
#' ),
|
||||||
|
#' nrounds = 3
|
||||||
|
#' )
|
||||||
|
#'
|
||||||
|
#' # Now can get the quantile cuts
|
||||||
|
#' xgb.get.DMatrix.qcut(dm)
|
||||||
|
#' @export
|
||||||
|
xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  # Only the first entry is used, so the default vector selects "list".
  output <- head(output, 1L)
  # Require exactly one value: a zero-length `output` (e.g. `NULL`) would
  # otherwise pass the `%in%` check vacuously and fail later inside `if`.
  stopifnot(length(output) == 1L)
  stopifnot(output %in% c("list", "arrays"))

  # Native call returns a list with entries `indptr` (base-0 offsets) and `data`.
  res <- .Call(XGDMatrixGetQuantileCut_R, dmat)
  if (output == "arrays") {
    return(res)
  }

  feature_names <- getinfo(dmat, "feature_name")
  # `indptr` holds one more entry than there are columns; clamp at zero so an
  # empty `indptr` is treated as "no columns".
  ncols <- max(length(res$indptr) - 1L, 0L)
  # `seq_len()` gives an empty sequence for zero columns, whereas
  # `seq(1, 0)` would iterate over c(1, 0) and index out of range.
  out <- lapply(
    seq_len(ncols),
    function(col) {
      st <- res$indptr[col]
      end <- res$indptr[col + 1L]
      if (end <= st) {
        # No cuts recorded for this column.
        return(numeric())
      }
      # Convert the base-0 half-open range [st, end) to base-1 indices.
      res$data[seq.int(st + 1L, end)]
    }
  )
  if (NROW(feature_names)) {
    names(out) <- feature_names
  }
  out
}
|
||||||
|
|
||||||
|
#' @title Get Number of Non-Missing Entries in DMatrix
|
||||||
|
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
|
||||||
|
#' @return The number of non-missing entries in the DMatrix
|
||||||
|
#' @export
|
||||||
|
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
  # Reject anything that is not an xgb.DMatrix before handing it to native code.
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  # The count itself is computed by the native routine.
  n_non_missing <- .Call(XGDMatrixNumNonMissing_R, dmat)
  n_non_missing
}
|
||||||
|
|
||||||
|
#' @title Get DMatrix Data
|
||||||
|
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
|
||||||
|
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
|
||||||
|
#' from package `Matrix`). If it had feature names, these will be added as column names
|
||||||
|
#' in the output.
|
||||||
|
#' @export
|
||||||
|
xgb.get.DMatrix.data <- function(dmat) {
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  # Native call returns a list with `indptr`, `indices`, `data` and `ncols`.
  csr <- .Call(XGDMatrixGetDataAsCSR_R, dmat)

  # Feature names, when present, become the column names of the result.
  feature_names <- getinfo(dmat, "feature_name")
  col_names <- if (NROW(feature_names)) feature_names else NULL

  # `indptr` has one entry per row plus a trailing one.
  n_rows <- length(csr$indptr) - 1

  # Build the CSR matrix by filling the slots of an empty 'dgRMatrix'.
  mat <- methods::new("dgRMatrix")
  mat@p <- csr$indptr
  mat@j <- csr$indices
  mat@x <- csr$data
  mat@Dim <- as.integer(c(n_rows, csr$ncols))
  mat@Dimnames <- list(NULL, col_names)
  mat
}
|
||||||
|
|
||||||
#' Get a new DMatrix containing the specified rows of
|
#' Get a new DMatrix containing the specified rows of
|
||||||
#' original xgb.DMatrix object
|
#' original xgb.DMatrix object
|
||||||
|
|||||||
@ -82,7 +82,7 @@ NULL
|
|||||||
NULL
|
NULL
|
||||||
|
|
||||||
# Various imports
|
# Various imports
|
||||||
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
|
#' @importClassesFrom Matrix dgCMatrix dgeMatrix dgRMatrix
|
||||||
#' @importFrom Matrix colSums
|
#' @importFrom Matrix colSums
|
||||||
#' @importFrom Matrix sparse.model.matrix
|
#' @importFrom Matrix sparse.model.matrix
|
||||||
#' @importFrom Matrix sparseVector
|
#' @importFrom Matrix sparseVector
|
||||||
@ -98,6 +98,7 @@ NULL
|
|||||||
#' @importFrom data.table setnames
|
#' @importFrom data.table setnames
|
||||||
#' @importFrom jsonlite fromJSON
|
#' @importFrom jsonlite fromJSON
|
||||||
#' @importFrom jsonlite toJSON
|
#' @importFrom jsonlite toJSON
|
||||||
|
#' @importFrom methods new
|
||||||
#' @importFrom utils object.size str tail
|
#' @importFrom utils object.size str tail
|
||||||
#' @importFrom stats predict
|
#' @importFrom stats predict
|
||||||
#' @importFrom stats median
|
#' @importFrom stats median
|
||||||
|
|||||||
19
R-package/man/xgb.get.DMatrix.data.Rd
Normal file
19
R-package/man/xgb.get.DMatrix.data.Rd
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.get.DMatrix.data}
|
||||||
|
\alias{xgb.get.DMatrix.data}
|
||||||
|
\title{Get DMatrix Data}
|
||||||
|
\usage{
|
||||||
|
xgb.get.DMatrix.data(dmat)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
The data held in the DMatrix, as a sparse CSR matrix (class \code{dgRMatrix}
|
||||||
|
from package \code{Matrix}). If it had feature names, these will be added as column names
|
||||||
|
in the output.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Get DMatrix Data
|
||||||
|
}
|
||||||
17
R-package/man/xgb.get.DMatrix.num.non.missing.Rd
Normal file
17
R-package/man/xgb.get.DMatrix.num.non.missing.Rd
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.get.DMatrix.num.non.missing}
|
||||||
|
\alias{xgb.get.DMatrix.num.non.missing}
|
||||||
|
\title{Get Number of Non-Missing Entries in DMatrix}
|
||||||
|
\usage{
|
||||||
|
xgb.get.DMatrix.num.non.missing(dmat)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
The number of non-missing entries in the DMatrix
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Get Number of Non-Missing Entries in DMatrix
|
||||||
|
}
|
||||||
58
R-package/man/xgb.get.DMatrix.qcut.Rd
Normal file
58
R-package/man/xgb.get.DMatrix.qcut.Rd
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.get.DMatrix.qcut}
|
||||||
|
\alias{xgb.get.DMatrix.qcut}
|
||||||
|
\title{Get Quantile Cuts from DMatrix}
|
||||||
|
\usage{
|
||||||
|
xgb.get.DMatrix.qcut(dmat, output = c("list", "arrays"))
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
|
||||||
|
|
||||||
|
\item{output}{Output format for the quantile cuts. Possible options are:\itemize{
|
||||||
|
\item \code{"list"} will return the output as a list with one entry per column, where
|
||||||
|
each column will have a numeric vector with the cuts. The list will be named if
|
||||||
|
\code{dmat} has column names assigned to it.
|
||||||
|
\item \code{"arrays"} will return a list with entries \code{indptr} (base-0 indexing) and
|
||||||
|
\code{data}. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
|
||||||
|
\code{indptr[i]+1} to \code{indptr[i+1]}.
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
The quantile cuts, in the format specified by parameter \code{output}.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Get the quantile cuts (a.k.a. borders) from an \code{xgb.DMatrix}
|
||||||
|
that has been quantized for the histogram method (\code{tree_method="hist"}).
|
||||||
|
|
||||||
|
These cuts are used in order to assign observations to bins - i.e. these are ordered
|
||||||
|
boundaries which are used to determine assignment condition \verb{border_low < x < border_high}.
|
||||||
|
As such, the first and last bin will be outside of the range of the data, so as to include
|
||||||
|
all of the observations there.
|
||||||
|
|
||||||
|
If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
|
||||||
|
which will be output in sorted order from lowest to highest.
|
||||||
|
|
||||||
|
Different columns can have different numbers of bins according to their range.
|
||||||
|
}
|
||||||
|
\examples{
|
||||||
|
library(xgboost)
|
||||||
|
data(mtcars)
|
||||||
|
y <- mtcars$mpg
|
||||||
|
x <- as.matrix(mtcars[, -1])
|
||||||
|
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||||
|
|
||||||
|
# DMatrix is not quantized right away, but will be once a hist model is generated
|
||||||
|
model <- xgb.train(
|
||||||
|
data = dm,
|
||||||
|
params = list(
|
||||||
|
tree_method = "hist",
|
||||||
|
max_bin = 8,
|
||||||
|
nthread = 1
|
||||||
|
),
|
||||||
|
nrounds = 3
|
||||||
|
)
|
||||||
|
|
||||||
|
# Now can get the quantile cuts
|
||||||
|
xgb.get.DMatrix.qcut(dm)
|
||||||
|
}
|
||||||
@ -63,6 +63,7 @@ OBJECTS= \
|
|||||||
$(PKGROOT)/src/gbm/gblinear.o \
|
$(PKGROOT)/src/gbm/gblinear.o \
|
||||||
$(PKGROOT)/src/gbm/gblinear_model.o \
|
$(PKGROOT)/src/gbm/gblinear_model.o \
|
||||||
$(PKGROOT)/src/data/adapter.o \
|
$(PKGROOT)/src/data/adapter.o \
|
||||||
|
$(PKGROOT)/src/data/array_interface.o \
|
||||||
$(PKGROOT)/src/data/simple_dmatrix.o \
|
$(PKGROOT)/src/data/simple_dmatrix.o \
|
||||||
$(PKGROOT)/src/data/data.o \
|
$(PKGROOT)/src/data/data.o \
|
||||||
$(PKGROOT)/src/data/sparse_page_raw_format.o \
|
$(PKGROOT)/src/data/sparse_page_raw_format.o \
|
||||||
|
|||||||
@ -63,6 +63,7 @@ OBJECTS= \
|
|||||||
$(PKGROOT)/src/gbm/gblinear.o \
|
$(PKGROOT)/src/gbm/gblinear.o \
|
||||||
$(PKGROOT)/src/gbm/gblinear_model.o \
|
$(PKGROOT)/src/gbm/gblinear_model.o \
|
||||||
$(PKGROOT)/src/data/adapter.o \
|
$(PKGROOT)/src/data/adapter.o \
|
||||||
|
$(PKGROOT)/src/data/array_interface.o \
|
||||||
$(PKGROOT)/src/data/simple_dmatrix.o \
|
$(PKGROOT)/src/data/simple_dmatrix.o \
|
||||||
$(PKGROOT)/src/data/data.o \
|
$(PKGROOT)/src/data/data.o \
|
||||||
$(PKGROOT)/src/data/sparse_page_raw_format.o \
|
$(PKGROOT)/src/data/sparse_page_raw_format.o \
|
||||||
|
|||||||
@ -45,6 +45,9 @@ extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
|
|||||||
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
|
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
|
||||||
extern SEXP XGDMatrixNumCol_R(SEXP);
|
extern SEXP XGDMatrixNumCol_R(SEXP);
|
||||||
extern SEXP XGDMatrixNumRow_R(SEXP);
|
extern SEXP XGDMatrixNumRow_R(SEXP);
|
||||||
|
extern SEXP XGDMatrixGetQuantileCut_R(SEXP);
|
||||||
|
extern SEXP XGDMatrixNumNonMissing_R(SEXP);
|
||||||
|
extern SEXP XGDMatrixGetDataAsCSR_R(SEXP);
|
||||||
extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
|
extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
|
||||||
extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
|
extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
|
||||||
extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
|
extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
|
||||||
@ -84,6 +87,9 @@ static const R_CallMethodDef CallEntries[] = {
|
|||||||
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
|
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
|
||||||
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
|
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
|
||||||
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
|
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
|
||||||
|
{"XGDMatrixGetQuantileCut_R", (DL_FUNC) &XGDMatrixGetQuantileCut_R, 1},
|
||||||
|
{"XGDMatrixNumNonMissing_R", (DL_FUNC) &XGDMatrixNumNonMissing_R, 1},
|
||||||
|
{"XGDMatrixGetDataAsCSR_R", (DL_FUNC) &XGDMatrixGetDataAsCSR_R, 1},
|
||||||
{"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3},
|
{"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3},
|
||||||
{"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3},
|
{"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3},
|
||||||
{"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3},
|
{"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3},
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2014-2023 by XGBoost Contributors
|
* Copyright 2014-2024, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#include <dmlc/common.h>
|
#include <dmlc/common.h>
|
||||||
#include <dmlc/omp.h>
|
#include <dmlc/omp.h>
|
||||||
@ -9,9 +9,11 @@
|
|||||||
#include <xgboost/logging.h>
|
#include <xgboost/logging.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <limits>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
@ -20,14 +22,14 @@
|
|||||||
#include "../../src/c_api/c_api_error.h"
|
#include "../../src/c_api/c_api_error.h"
|
||||||
#include "../../src/c_api/c_api_utils.h" // MakeSparseFromPtr
|
#include "../../src/c_api/c_api_utils.h" // MakeSparseFromPtr
|
||||||
#include "../../src/common/threading_utils.h"
|
#include "../../src/common/threading_utils.h"
|
||||||
|
#include "../../src/data/array_interface.h" // for ArrayInterface
|
||||||
|
|
||||||
#include "./xgboost_R.h" // Must follow other includes.
|
#include "./xgboost_R.h" // Must follow other includes.
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
struct ErrorWithUnwind : public std::exception {};
|
struct ErrorWithUnwind : public std::exception {};
|
||||||
|
|
||||||
void ThrowExceptionFromRError(void *unused, Rboolean jump) {
|
void ThrowExceptionFromRError(void *, Rboolean jump) {
|
||||||
if (jump) {
|
if (jump) {
|
||||||
throw ErrorWithUnwind();
|
throw ErrorWithUnwind();
|
||||||
}
|
}
|
||||||
@ -49,6 +51,30 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
|
|||||||
continuation_token);
|
continuation_token);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Callback handed to R_UnwindProtect: `void_ptr` points to a size_t holding the
// requested length of the numeric (REALSXP) vector to allocate.
SEXP WrappedAllocReal(void *void_ptr) {
  auto const n_elements = *static_cast<size_t *>(void_ptr);
  return Rf_allocVector(REALSXP, n_elements);
}
|
||||||
|
|
||||||
|
// Allocate a numeric R vector of length `size` under R_UnwindProtect. If R
// starts a non-local jump during the allocation, ThrowExceptionFromRError is
// invoked as the cleanup callback and raises ErrorWithUnwind instead.
SEXP SafeAllocReal(size_t size, SEXP continuation_token) {
  void *payload = static_cast<void *>(&size);
  return R_UnwindProtect(WrappedAllocReal, payload, ThrowExceptionFromRError, nullptr,
                         continuation_token);
}
|
||||||
|
|
||||||
|
// Callback handed to R_UnwindProtect: `void_ptr` points to a size_t holding the
// requested length of the integer (INTSXP) vector to allocate.
SEXP WrappedAllocInteger(void *void_ptr) {
  auto const n_elements = *static_cast<size_t *>(void_ptr);
  return Rf_allocVector(INTSXP, n_elements);
}
|
||||||
|
|
||||||
|
// Allocate an integer R vector of length `size` under R_UnwindProtect. If R
// starts a non-local jump during the allocation, ThrowExceptionFromRError is
// invoked as the cleanup callback and raises ErrorWithUnwind instead.
SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
  void *payload = static_cast<void *>(&size);
  return R_UnwindProtect(WrappedAllocInteger, payload, ThrowExceptionFromRError, nullptr,
                         continuation_token);
}
|
||||||
|
|
||||||
[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) {
|
[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) {
|
||||||
SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol);
|
SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol);
|
||||||
if (Rf_xlength(mat_dims) > 2) {
|
if (Rf_xlength(mat_dims) > 2) {
|
||||||
@ -136,6 +162,37 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
|
|||||||
jconfig["nthread"] = Rf_asInteger(n_threads);
|
jconfig["nthread"] = Rf_asInteger(n_threads);
|
||||||
return Json::Dump(jconfig);
|
return Json::Dump(jconfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allocate a R vector and copy an array interface encoded object to it.
|
||||||
|
[[nodiscard]] SEXP CopyArrayToR(const char *array_str, SEXP ctoken) {
|
||||||
|
xgboost::ArrayInterface<1> array{xgboost::StringView{array_str}};
|
||||||
|
// R supports only int and double.
|
||||||
|
bool is_int =
|
||||||
|
xgboost::DispatchDType(array.type, [](auto t) { return std::is_integral_v<decltype(t)>; });
|
||||||
|
bool is_float = xgboost::DispatchDType(
|
||||||
|
array.type, [](auto v) { return std::is_floating_point_v<decltype(v)>; });
|
||||||
|
CHECK(is_int || is_float) << "Internal error: Invalid DType.";
|
||||||
|
CHECK(array.is_contiguous) << "Internal error: Return by XGBoost should be contiguous";
|
||||||
|
|
||||||
|
// Allocate memory in R
|
||||||
|
SEXP out =
|
||||||
|
Rf_protect(is_int ? SafeAllocInteger(array.n, ctoken) : SafeAllocReal(array.n, ctoken));
|
||||||
|
|
||||||
|
xgboost::DispatchDType(array.type, [&](auto t) {
|
||||||
|
using T = decltype(t);
|
||||||
|
auto in_ptr = static_cast<T const *>(array.data);
|
||||||
|
if (is_int) {
|
||||||
|
auto out_ptr = INTEGER(out);
|
||||||
|
std::copy_n(in_ptr, array.n, out_ptr);
|
||||||
|
} else {
|
||||||
|
auto out_ptr = REAL(out);
|
||||||
|
std::copy_n(in_ptr, array.n, out_ptr);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Rf_unprotect(1);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
struct RRNGStateController {
|
struct RRNGStateController {
|
||||||
@ -540,6 +597,73 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
|
|||||||
return ScalarInteger(static_cast<int>(ncol));
|
return ScalarInteger(static_cast<int>(ncol));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the quantile cuts of a DMatrix as an R list with entries 'indptr' and 'data'.
XGB_DLL SEXP XGDMatrixGetQuantileCut_R(SEXP handle) {
  const char *field_names[] = {"indptr", "data", ""};
  // Token used to resume an R unwind that was intercepted during allocation.
  SEXP continuation_token = Rf_protect(R_MakeUnwindCont());
  SEXP result = Rf_protect(Rf_mkNamed(VECSXP, field_names));
  R_API_BEGIN();
  // Both outputs are array-interface strings filled in by the C API.
  const char *indptr_str;
  const char *data_str;
  CHECK_CALL(
      XGDMatrixGetQuantileCut(R_ExternalPtrAddr(handle), "{}", &indptr_str, &data_str));
  try {
    SEXP r_indptr = CopyArrayToR(indptr_str, continuation_token);
    SET_VECTOR_ELT(result, 0, r_indptr);
    SEXP r_data = CopyArrayToR(data_str, continuation_token);
    SET_VECTOR_ELT(result, 1, r_data);
  } catch (ErrorWithUnwind &) {
    // The allocation was interrupted by R; hand control back to R's unwinder.
    R_ContinueUnwind(continuation_token);
  }
  R_API_END();
  Rf_unprotect(2);
  return result;
}
|
||||||
|
|
||||||
|
// Return the number of non-missing entries of a DMatrix as a length-1 numeric vector.
XGB_DLL SEXP XGDMatrixNumNonMissing_R(SEXP handle) {
  // The count is stored as a double (REALSXP) rather than as an R integer.
  SEXP result = Rf_protect(Rf_allocVector(REALSXP, 1));
  R_API_BEGIN();
  bst_ulong n_non_missing;
  CHECK_CALL(XGDMatrixNumNonMissing(R_ExternalPtrAddr(handle), &n_non_missing));
  double *dst = REAL(result);
  *dst = static_cast<double>(n_non_missing);
  R_API_END();
  Rf_unprotect(1);
  return result;
}
|
||||||
|
|
||||||
|
// Return the data of a DMatrix in CSR form as an R list with entries
// 'indptr', 'indices', 'data' and 'ncols' (in this order).
XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle) {
  const char *out_names[] = {"indptr", "indices", "data", "ncols", ""};
  SEXP out = Rf_protect(Rf_mkNamed(VECSXP, out_names));
  R_API_BEGIN();

  bst_ulong nrows, ncols, nnz;
  CHECK_CALL(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrows));
  CHECK_CALL(XGDMatrixNumCol(R_ExternalPtrAddr(handle), &ncols));
  CHECK_CALL(XGDMatrixNumNonMissing(R_ExternalPtrAddr(handle), &nnz));
  // Dimensions, column indices and row pointers are all returned as R integers.
  // The row pointers count entries up to 'nnz', so 'nnz' must fit into an int
  // too; otherwise the copy into 'indptr' below would silently overflow.
  if (std::max({nrows, ncols, nnz}) >
      static_cast<bst_ulong>(std::numeric_limits<int>::max())) {
    Rf_error("%s", "Error: resulting DMatrix data does not fit into R 'dgRMatrix'.");
  }

  // Allocate the R-side outputs first; each is owned by 'out' once assigned.
  SET_VECTOR_ELT(out, 0, Rf_allocVector(INTSXP, nrows + 1));
  SET_VECTOR_ELT(out, 1, Rf_allocVector(INTSXP, nnz));
  SET_VECTOR_ELT(out, 2, Rf_allocVector(REALSXP, nnz));
  SET_VECTOR_ELT(out, 3, Rf_ScalarInteger(static_cast<int>(ncols)));

  // Temporary buffers in the element types expected by the C API.
  std::unique_ptr<bst_ulong[]> indptr(new bst_ulong[nrows + 1]);
  std::unique_ptr<unsigned[]> indices(new unsigned[nnz]);
  std::unique_ptr<float[]> data(new float[nnz]);

  CHECK_CALL(XGDMatrixGetDataAsCSR(R_ExternalPtrAddr(handle),
                                   "{}",
                                   indptr.get(),
                                   indices.get(),
                                   data.get()));

  // Convert element-wise into the R storage types (int / double).
  std::copy(indptr.get(), indptr.get() + nrows + 1, INTEGER(VECTOR_ELT(out, 0)));
  std::copy(indices.get(), indices.get() + nnz, INTEGER(VECTOR_ELT(out, 1)));
  std::copy(data.get(), data.get() + nnz, REAL(VECTOR_ELT(out, 2)));

  R_API_END();
  Rf_unprotect(1);
  return out;
}
|
||||||
|
|
||||||
// functions related to booster
|
// functions related to booster
|
||||||
void _BoosterFinalizer(SEXP ext) {
|
void _BoosterFinalizer(SEXP ext) {
|
||||||
if (R_ExternalPtrAddr(ext) == NULL) return;
|
if (R_ExternalPtrAddr(ext) == NULL) return;
|
||||||
|
|||||||
@ -143,6 +143,31 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
|
|||||||
*/
|
*/
|
||||||
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);
|
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief return the quantile cuts used for the histogram method
|
||||||
|
* \param handle an instance of data matrix
|
||||||
|
* \return A list with entries 'indptr' and 'data'
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGDMatrixGetQuantileCut_R(SEXP handle);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief get the number of non-missing entries in a dmatrix
|
||||||
|
* \param handle an instance of data matrix
|
||||||
|
* \return the number of non-missing entries
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGDMatrixNumNonMissing_R(SEXP handle);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief get the data in a dmatrix in CSR format
|
||||||
|
* \param handle an instance of data matrix
|
||||||
|
* \return R list with the following entries in this order:
|
||||||
|
* - 'indptr'
|
||||||
|
* - 'indices'
|
||||||
|
* - 'data'
|
||||||
|
* - 'ncols'
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief create xgboost learner
|
* \brief create xgboost learner
|
||||||
* \param dmats a list of dmatrix handles that will be cached
|
* \param dmats a list of dmatrix handles that will be cached
|
||||||
|
|||||||
@ -375,3 +375,62 @@ test_that("xgb.DMatrix: can take multi-dimensional 'base_margin'", {
|
|||||||
)
|
)
|
||||||
expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5)
|
expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: number of non-missing matches data", {
  # Fully populated 5x2 matrix: all ten entries are present.
  dense <- matrix(1:10, nrow = 5)
  n_dense <- xgb.get.DMatrix.num.non.missing(xgb.DMatrix(dense))
  expect_equal(n_dense, 10)

  # Blank out two cells; the count must drop accordingly.
  with_na <- dense
  with_na[2, 2] <- NA
  with_na[4, 1] <- NA
  n_with_na <- xgb.get.DMatrix.num.non.missing(xgb.DMatrix(with_na))
  expect_equal(n_with_na, 8)
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: retrieving data as CSR", {
  data(mtcars)
  dense <- as.matrix(mtcars)
  csr <- xgb.get.DMatrix.data(xgb.DMatrix(dense))

  # Shape and column names must survive the round trip.
  expect_equal(dim(csr), dim(mtcars))
  expect_equal(colnames(csr), colnames(mtcars))
  # Values are compared with a tolerance and without dimnames.
  expect_equal(unname(as.matrix(csr)), unname(dense), tolerance = 1e-6)
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: quantile cuts look correct", {
  data(mtcars)
  y <- mtcars$mpg
  x <- as.matrix(mtcars[, -1])
  dm <- xgb.DMatrix(x, label = y)
  # Train a small hist model on the DMatrix before asking for its cuts.
  hist_params <- list(
    tree_method = "hist",
    max_bin = 8,
    nthread = 1
  )
  model <- xgb.train(data = dm, params = hist_params, nrounds = 3)

  qcut_list <- xgb.get.DMatrix.qcut(dm, "list")
  qcut_arrays <- xgb.get.DMatrix.qcut(dm, "arrays")

  # "arrays" output: two named entries, one offset per column plus one,
  # and every column has at least one cut.
  expect_equal(length(qcut_arrays), 2)
  expect_equal(names(qcut_arrays), c("indptr", "data"))
  expect_equal(length(qcut_arrays$indptr), ncol(x) + 1)
  expect_true(min(diff(qcut_arrays$indptr)) > 0)

  col_min <- apply(x, 2, min)
  col_max <- apply(x, 2, max)

  # "list" output: one named entry per column.
  expect_equal(length(qcut_list), ncol(x))
  expect_equal(names(qcut_list), colnames(x))
  for (col in seq_len(ncol(x))) {
    cuts <- qcut_list[[col]]
    # Cuts are strictly increasing and bracket the observed data range.
    expect_true(min(diff(cuts)) > 0)
    expect_true(col_min[col] > cuts[1])
    expect_true(col_max[col] < cuts[length(cuts)])
    # max_bin = 8 allows at most 9 borders.
    expect_true(length(cuts) <= 9)
  }
})
|
||||||
|
|||||||
13
src/data/array_interface.cc
Normal file
13
src/data/array_interface.cc
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2019-2024, XGBoost Contributors
|
||||||
|
*/
|
||||||
|
#include "array_interface.h"
|
||||||
|
|
||||||
|
#include "../common/common.h" // for AssertGPUSupport
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
#if !defined(XGBOOST_USE_CUDA)
|
||||||
|
void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
|
||||||
|
bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
|
||||||
|
#endif // !defined(XGBOOST_USE_CUDA)
|
||||||
|
} // namespace xgboost
|
||||||
@ -375,11 +375,6 @@ struct ToDType<int64_t> {
|
|||||||
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
|
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
|
||||||
};
|
};
|
||||||
|
|
||||||
#if !defined(XGBOOST_USE_CUDA)
|
|
||||||
inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
|
|
||||||
inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
|
|
||||||
#endif // !defined(XGBOOST_USE_CUDA)
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief A type erased view over __array_interface__ protocol defined by numpy
|
* \brief A type erased view over __array_interface__ protocol defined by numpy
|
||||||
*
|
*
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user