More robust DMatrix creation from a sparse matrix (#1606)

* [CORE] DMatrix from sparse w/ explicit #col #row; safer arg types

* [python-package] c-api change for _init_from_csr _init_from_csc

* fix spaces

* [R-package] adopt the new XGDMatrixCreateFromCSCEx interface

* [CORE] redirect old sparse creators to new ones
This commit is contained in:
Vadim Khotilovich
2016-09-25 12:01:22 -05:00
committed by Tianqi Chen
parent e06f6a0df7
commit 693ddb860e
8 changed files with 167 additions and 41 deletions

View File

@@ -27,7 +27,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
PACKAGE = "xgboost")
cnames <- colnames(data)
} else if (class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data),
PACKAGE = "xgboost")
cnames <- colnames(data)
} else {

View File

@@ -87,30 +87,32 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data) {
SEXP data,
SEXP num_row) {
SEXP ret;
R_API_BEGIN();
const int *p_indptr = INTEGER(indptr);
const int *p_indices = INTEGER(indices);
const double *p_data = REAL(data);
int nindptr = length(indptr);
int ndata = length(data);
std::vector<bst_ulong> col_ptr_(nindptr);
size_t nindptr = static_cast<size_t>(length(indptr));
size_t ndata = static_cast<size_t>(length(data));
size_t nrow = static_cast<size_t>(INTEGER(num_row)[0]);
std::vector<size_t> col_ptr_(nindptr);
std::vector<unsigned> indices_(ndata);
std::vector<float> data_(ndata);
for (int i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
for (size_t i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
for (size_t i = 0; i < ndata; ++i) {
indices_[i] = static_cast<unsigned>(p_indices[i]);
data_[i] = static_cast<float>(p_data[i]);
}
DMatrixHandle handle;
CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata,
&handle));
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata,
nrow, &handle));
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();

View File

@@ -43,11 +43,13 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \param num_row numer of rows (when it's set to 0, then guess from data)
* \return created dmatrix
*/
XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
SEXP data,
SEXP num_row);
/*!
* \brief create a new dmatrix from sliced content of existing matrix

View File

@@ -1,4 +1,5 @@
require(xgboost)
require(Matrix)
context("testing xgb.DMatrix functionality")
@@ -65,3 +66,13 @@ test_that("xgb.DMatrix: colnames", {
expect_silent(colnames(dtest) <- NULL)
expect_null(colnames(dtest))
})
test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
set.seed(123)
nr <- 1000
x <- rsparsematrix(nr, 100, density=0.0005)
# we want it very sparse, so that last rows are empty
expect_lt(max(x@i), nr)
dtest <- xgb.DMatrix(x)
expect_equal(dim(dtest), dim(x))
})