[R] Use array interface for dense DMatrix creation (#9816)
--------- Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
parent
da3d55db5b
commit
37da66f865
@ -8,6 +8,7 @@
|
|||||||
#include <xgboost/data.h>
|
#include <xgboost/data.h>
|
||||||
#include <xgboost/logging.h>
|
#include <xgboost/logging.h>
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
@ -21,6 +22,67 @@
|
|||||||
|
|
||||||
#include "./xgboost_R.h" // Must follow other includes.
|
#include "./xgboost_R.h" // Must follow other includes.
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) {
|
||||||
|
SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol);
|
||||||
|
const int *ptr_mat_dims = INTEGER(mat_dims);
|
||||||
|
|
||||||
|
// Lambda for type dispatch.
|
||||||
|
auto make_matrix = [=](auto const *ptr) {
|
||||||
|
using namespace xgboost; // NOLINT
|
||||||
|
using T = std::remove_pointer_t<decltype(ptr)>;
|
||||||
|
|
||||||
|
auto m = linalg::MatrixView<T>{
|
||||||
|
common::Span{ptr,
|
||||||
|
static_cast<std::size_t>(ptr_mat_dims[0]) * static_cast<std::size_t>(ptr_mat_dims[1])},
|
||||||
|
{ptr_mat_dims[0], ptr_mat_dims[1]}, // Shape
|
||||||
|
DeviceOrd::CPU(),
|
||||||
|
linalg::Order::kF // R uses column-major
|
||||||
|
};
|
||||||
|
CHECK(m.FContiguous());
|
||||||
|
return linalg::ArrayInterfaceStr(m);
|
||||||
|
};
|
||||||
|
|
||||||
|
const SEXPTYPE arr_type = TYPEOF(R_mat);
|
||||||
|
switch (arr_type) {
|
||||||
|
case REALSXP:
|
||||||
|
return make_matrix(REAL(R_mat));
|
||||||
|
case INTSXP:
|
||||||
|
return make_matrix(INTEGER(R_mat));
|
||||||
|
case LGLSXP:
|
||||||
|
return make_matrix(LOGICAL(R_mat));
|
||||||
|
default:
|
||||||
|
LOG(FATAL) << "Array or matrix has unsupported type.";
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG(FATAL) << "Not reachable";
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
|
||||||
|
using namespace ::xgboost; // NOLINT
|
||||||
|
Json jconfig{Object{}};
|
||||||
|
|
||||||
|
const SEXPTYPE missing_type = TYPEOF(missing);
|
||||||
|
if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) ||
|
||||||
|
(missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) ||
|
||||||
|
(missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) {
|
||||||
|
// missing is not specified
|
||||||
|
if (arr_type == REALSXP) {
|
||||||
|
jconfig["missing"] = std::numeric_limits<double>::quiet_NaN();
|
||||||
|
} else {
|
||||||
|
jconfig["missing"] = R_NaInt;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// missing specified
|
||||||
|
jconfig["missing"] = Rf_asReal(missing);
|
||||||
|
}
|
||||||
|
|
||||||
|
jconfig["nthread"] = Rf_asInteger(n_threads);
|
||||||
|
return Json::Dump(jconfig);
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief macro to annotate begin of api
|
* \brief macro to annotate begin of api
|
||||||
*/
|
*/
|
||||||
@ -94,47 +156,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
|
XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
|
||||||
SEXP ret;
|
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
||||||
R_API_BEGIN();
|
R_API_BEGIN();
|
||||||
SEXP dim = getAttrib(mat, R_DimSymbol);
|
|
||||||
size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
|
|
||||||
size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
|
|
||||||
const bool is_int = TYPEOF(mat) == INTSXP;
|
|
||||||
double *din;
|
|
||||||
int *iin;
|
|
||||||
if (is_int) {
|
|
||||||
iin = INTEGER(mat);
|
|
||||||
} else {
|
|
||||||
din = REAL(mat);
|
|
||||||
}
|
|
||||||
std::vector<float> data(nrow * ncol);
|
|
||||||
xgboost::Context ctx;
|
|
||||||
ctx.nthread = asInteger(n_threads);
|
|
||||||
std::int32_t threads = ctx.Threads();
|
|
||||||
|
|
||||||
if (is_int) {
|
auto array_str = MakeArrayInterfaceFromRMat(mat);
|
||||||
xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
|
auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat));
|
||||||
for (size_t j = 0; j < ncol; ++j) {
|
|
||||||
auto v = iin[i + nrow * j];
|
|
||||||
if (v == NA_INTEGER) {
|
|
||||||
data[i * ncol + j] = std::numeric_limits<float>::quiet_NaN();
|
|
||||||
} else {
|
|
||||||
data[i * ncol + j] = static_cast<float>(v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
|
|
||||||
for (size_t j = 0; j < ncol; ++j) {
|
|
||||||
data[i * ncol + j] = din[i + nrow * j];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
DMatrixHandle handle;
|
DMatrixHandle handle;
|
||||||
CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol,
|
CHECK_CALL(XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle));
|
||||||
asReal(missing), &handle, threads));
|
|
||||||
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
R_SetExternalPtrAddr(ret, handle);
|
||||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||||
R_API_END();
|
R_API_END();
|
||||||
UNPROTECT(1);
|
UNPROTECT(1);
|
||||||
|
|||||||
@ -265,3 +265,35 @@ test_that("xgb.DMatrix: print", {
|
|||||||
})
|
})
|
||||||
expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: NA colnames: no")
|
expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: NA colnames: no")
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: Inf as missing", {
|
||||||
|
x_inf <- matrix(as.numeric(1:10), nrow = 5)
|
||||||
|
x_inf[2, 1] <- Inf
|
||||||
|
|
||||||
|
x_nan <- x_inf
|
||||||
|
x_nan[2, 1] <- NA_real_
|
||||||
|
|
||||||
|
m_inf <- xgb.DMatrix(x_inf, nthread = n_threads, missing = Inf)
|
||||||
|
xgb.DMatrix.save(m_inf, "inf.dmatrix")
|
||||||
|
|
||||||
|
m_nan <- xgb.DMatrix(x_nan, nthread = n_threads, missing = NA_real_)
|
||||||
|
xgb.DMatrix.save(m_nan, "nan.dmatrix")
|
||||||
|
|
||||||
|
infconn <- file("inf.dmatrix", "rb")
|
||||||
|
nanconn <- file("nan.dmatrix", "rb")
|
||||||
|
|
||||||
|
expect_equal(file.size("inf.dmatrix"), file.size("nan.dmatrix"))
|
||||||
|
|
||||||
|
bytes <- file.size("inf.dmatrix")
|
||||||
|
infdmatrix <- readBin(infconn, "raw", n = bytes)
|
||||||
|
nandmatrix <- readBin(nanconn, "raw", n = bytes)
|
||||||
|
|
||||||
|
expect_equal(length(infdmatrix), length(nandmatrix))
|
||||||
|
expect_equal(infdmatrix, nandmatrix)
|
||||||
|
|
||||||
|
close(infconn)
|
||||||
|
close(nanconn)
|
||||||
|
|
||||||
|
file.remove("inf.dmatrix")
|
||||||
|
file.remove("nan.dmatrix")
|
||||||
|
})
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user