More robust DMatrix creation from a sparse matrix (#1606)
* [CORE] DMatrix from sparse w/ explicit #col #row; safer arg types * [python-package] c-api change for _init_from_csr _init_from_csc * fix spaces * [R-package] adopt the new XGDMatrixCreateFromCSCEx interface * [CORE] redirect old sparse creators to new ones
This commit is contained in:
committed by
Tianqi Chen
parent
e06f6a0df7
commit
693ddb860e
@@ -27,7 +27,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
|
||||
PACKAGE = "xgboost")
|
||||
cnames <- colnames(data)
|
||||
} else if (class(data) == "dgCMatrix") {
|
||||
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
|
||||
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data),
|
||||
PACKAGE = "xgboost")
|
||||
cnames <- colnames(data)
|
||||
} else {
|
||||
|
||||
@@ -87,30 +87,32 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
|
||||
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
SEXP indices,
|
||||
SEXP data) {
|
||||
SEXP data,
|
||||
SEXP num_row) {
|
||||
SEXP ret;
|
||||
R_API_BEGIN();
|
||||
const int *p_indptr = INTEGER(indptr);
|
||||
const int *p_indices = INTEGER(indices);
|
||||
const double *p_data = REAL(data);
|
||||
int nindptr = length(indptr);
|
||||
int ndata = length(data);
|
||||
std::vector<bst_ulong> col_ptr_(nindptr);
|
||||
size_t nindptr = static_cast<size_t>(length(indptr));
|
||||
size_t ndata = static_cast<size_t>(length(data));
|
||||
size_t nrow = static_cast<size_t>(INTEGER(num_row)[0]);
|
||||
std::vector<size_t> col_ptr_(nindptr);
|
||||
std::vector<unsigned> indices_(ndata);
|
||||
std::vector<float> data_(ndata);
|
||||
|
||||
for (int i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
|
||||
for (size_t i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
|
||||
}
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < ndata; ++i) {
|
||||
for (size_t i = 0; i < ndata; ++i) {
|
||||
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
||||
data_[i] = static_cast<float>(p_data[i]);
|
||||
}
|
||||
DMatrixHandle handle;
|
||||
CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
|
||||
BeginPtr(data_), nindptr, ndata,
|
||||
&handle));
|
||||
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
|
||||
BeginPtr(data_), nindptr, ndata,
|
||||
nrow, &handle));
|
||||
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
R_API_END();
|
||||
|
||||
@@ -43,11 +43,13 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
* \param indptr pointer to column headers
|
||||
* \param indices row indices
|
||||
* \param data content of the data
|
||||
* \param num_row numer of rows (when it's set to 0, then guess from data)
|
||||
* \return created dmatrix
|
||||
*/
|
||||
XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
SEXP indices,
|
||||
SEXP data);
|
||||
SEXP data,
|
||||
SEXP num_row);
|
||||
|
||||
/*!
|
||||
* \brief create a new dmatrix from sliced content of existing matrix
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
require(xgboost)
|
||||
require(Matrix)
|
||||
|
||||
context("testing xgb.DMatrix functionality")
|
||||
|
||||
@@ -65,3 +66,13 @@ test_that("xgb.DMatrix: colnames", {
|
||||
expect_silent(colnames(dtest) <- NULL)
|
||||
expect_null(colnames(dtest))
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
|
||||
set.seed(123)
|
||||
nr <- 1000
|
||||
x <- rsparsematrix(nr, 100, density=0.0005)
|
||||
# we want it very sparse, so that last rows are empty
|
||||
expect_lt(max(x@i), nr)
|
||||
dtest <- xgb.DMatrix(x)
|
||||
expect_equal(dim(dtest), dim(x))
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user