More robust DMatrix creation from a sparse matrix (#1606)
* [CORE] DMatrix from sparse w/ explicit #col #row; safer arg types * [python-package] c-api change for _init_from_csr _init_from_csc * fix spaces * [R-package] adopt the new XGDMatrixCreateFromCSCEx interface * [CORE] redirect old sparse creators to new ones
This commit is contained in:
parent
e06f6a0df7
commit
693ddb860e
@ -27,7 +27,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
|
||||
PACKAGE = "xgboost")
|
||||
cnames <- colnames(data)
|
||||
} else if (class(data) == "dgCMatrix") {
|
||||
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
|
||||
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data),
|
||||
PACKAGE = "xgboost")
|
||||
cnames <- colnames(data)
|
||||
} else {
|
||||
|
||||
@ -87,30 +87,32 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
|
||||
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
SEXP indices,
|
||||
SEXP data) {
|
||||
SEXP data,
|
||||
SEXP num_row) {
|
||||
SEXP ret;
|
||||
R_API_BEGIN();
|
||||
const int *p_indptr = INTEGER(indptr);
|
||||
const int *p_indices = INTEGER(indices);
|
||||
const double *p_data = REAL(data);
|
||||
int nindptr = length(indptr);
|
||||
int ndata = length(data);
|
||||
std::vector<bst_ulong> col_ptr_(nindptr);
|
||||
size_t nindptr = static_cast<size_t>(length(indptr));
|
||||
size_t ndata = static_cast<size_t>(length(data));
|
||||
size_t nrow = static_cast<size_t>(INTEGER(num_row)[0]);
|
||||
std::vector<size_t> col_ptr_(nindptr);
|
||||
std::vector<unsigned> indices_(ndata);
|
||||
std::vector<float> data_(ndata);
|
||||
|
||||
for (int i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
|
||||
for (size_t i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
|
||||
}
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < ndata; ++i) {
|
||||
for (size_t i = 0; i < ndata; ++i) {
|
||||
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
||||
data_[i] = static_cast<float>(p_data[i]);
|
||||
}
|
||||
DMatrixHandle handle;
|
||||
CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
|
||||
BeginPtr(data_), nindptr, ndata,
|
||||
&handle));
|
||||
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
|
||||
BeginPtr(data_), nindptr, ndata,
|
||||
nrow, &handle));
|
||||
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
R_API_END();
|
||||
|
||||
@ -43,11 +43,13 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
* \param indptr pointer to column headers
|
||||
* \param indices row indices
|
||||
* \param data content of the data
|
||||
* \param num_row number of rows (when it's set to 0, then guess from data)
|
||||
* \return created dmatrix
|
||||
*/
|
||||
XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
SEXP indices,
|
||||
SEXP data);
|
||||
SEXP data,
|
||||
SEXP num_row);
|
||||
|
||||
/*!
|
||||
* \brief create a new dmatrix from sliced content of existing matrix
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
require(xgboost)
|
||||
require(Matrix)
|
||||
|
||||
context("testing xgb.DMatrix functionality")
|
||||
|
||||
@ -65,3 +66,13 @@ test_that("xgb.DMatrix: colnames", {
|
||||
expect_silent(colnames(dtest) <- NULL)
|
||||
expect_null(colnames(dtest))
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
|
||||
set.seed(123)
|
||||
nr <- 1000
|
||||
x <- rsparsematrix(nr, 100, density=0.0005)
|
||||
# we want it very sparse, so that last rows are empty
|
||||
expect_lt(max(x@i), nr)
|
||||
dtest <- xgb.DMatrix(x)
|
||||
expect_equal(dim(dtest), dim(x))
|
||||
})
|
||||
|
||||
@ -114,7 +114,26 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
|
||||
DMatrixHandle *out);
|
||||
|
||||
/*!
|
||||
* \brief create a matrix content from csr format
|
||||
* \brief create a matrix content from CSR format
|
||||
* \param indptr pointer to row headers
|
||||
* \param indices findex
|
||||
* \param data fvalue
|
||||
* \param nindptr number of rows in the matrix + 1
|
||||
* \param nelem number of nonzero elements in the matrix
|
||||
* \param num_col number of columns; when it's set to 0, then guess from data
|
||||
* \param out created dmatrix
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
size_t nindptr,
|
||||
size_t nelem,
|
||||
size_t num_col,
|
||||
DMatrixHandle* out);
|
||||
/*!
|
||||
* \deprecated
|
||||
* \brief create a matrix content from CSR format
|
||||
* \param indptr pointer to row headers
|
||||
* \param indices findex
|
||||
* \param data fvalue
|
||||
@ -130,6 +149,25 @@ XGB_DLL int XGDMatrixCreateFromCSR(const bst_ulong *indptr,
|
||||
bst_ulong nelem,
|
||||
DMatrixHandle *out);
|
||||
/*!
|
||||
* \brief create a matrix content from CSC format
|
||||
* \param col_ptr pointer to col headers
|
||||
* \param indices findex
|
||||
* \param data fvalue
|
||||
* \param nindptr number of columns in the matrix + 1
|
||||
* \param nelem number of nonzero elements in the matrix
|
||||
* \param num_row number of rows; when it's set to 0, then guess from data
|
||||
* \param out created dmatrix
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
size_t nindptr,
|
||||
size_t nelem,
|
||||
size_t num_row,
|
||||
DMatrixHandle* out);
|
||||
/*!
|
||||
* \deprecated
|
||||
* \brief create a matrix content from CSC format
|
||||
* \param col_ptr pointer to col headers
|
||||
* \param indices findex
|
||||
|
||||
@ -287,11 +287,12 @@ class DMatrix(object):
|
||||
if len(csr.indices) != len(csr.data):
|
||||
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
|
||||
self.handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGDMatrixCreateFromCSR(c_array(ctypes.c_ulong, csr.indptr),
|
||||
c_array(ctypes.c_uint, csr.indices),
|
||||
c_array(ctypes.c_float, csr.data),
|
||||
len(csr.indptr), len(csr.data),
|
||||
ctypes.byref(self.handle)))
|
||||
_check_call(_LIB.XGDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr),
|
||||
c_array(ctypes.c_uint, csr.indices),
|
||||
c_array(ctypes.c_float, csr.data),
|
||||
len(csr.indptr), len(csr.data),
|
||||
csr.shape[1],
|
||||
ctypes.byref(self.handle)))
|
||||
|
||||
def _init_from_csc(self, csc):
|
||||
"""
|
||||
@ -300,11 +301,12 @@ class DMatrix(object):
|
||||
if len(csc.indices) != len(csc.data):
|
||||
raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
|
||||
self.handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGDMatrixCreateFromCSC(c_array(ctypes.c_ulong, csc.indptr),
|
||||
c_array(ctypes.c_uint, csc.indices),
|
||||
c_array(ctypes.c_float, csc.data),
|
||||
len(csc.indptr), len(csc.data),
|
||||
ctypes.byref(self.handle)))
|
||||
_check_call(_LIB.XGDMatrixCreateFromCSCEx(c_array(ctypes.c_size_t, csc.indptr),
|
||||
c_array(ctypes.c_uint, csc.indices),
|
||||
c_array(ctypes.c_float, csc.data),
|
||||
len(csc.indptr), len(csc.data),
|
||||
csc.shape[0],
|
||||
ctypes.byref(self.handle)))
|
||||
|
||||
def _init_from_npy2d(self, mat, missing):
|
||||
"""
|
||||
|
||||
@ -227,38 +227,58 @@ int XGDMatrixCreateFromDataIter(
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
|
||||
const unsigned *indices,
|
||||
const float* data,
|
||||
xgboost::bst_ulong nindptr,
|
||||
xgboost::bst_ulong nelem,
|
||||
DMatrixHandle* out) {
|
||||
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
size_t nindptr,
|
||||
size_t nelem,
|
||||
size_t num_col,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
mat.row_ptr_.resize(nindptr);
|
||||
for (xgboost::bst_ulong i = 0; i < nindptr; ++i) {
|
||||
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
|
||||
for (size_t i = 0; i < nindptr; ++i) {
|
||||
mat.row_ptr_[i] = indptr[i];
|
||||
}
|
||||
mat.row_data_.resize(nelem);
|
||||
for (xgboost::bst_ulong i = 0; i < nelem; ++i) {
|
||||
for (size_t i = 0; i < nelem; ++i) {
|
||||
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
|
||||
mat.info.num_col = std::max(mat.info.num_col,
|
||||
static_cast<uint64_t>(indices[i] + 1));
|
||||
}
|
||||
if (num_col > 0) {
|
||||
CHECK_LE(mat.info.num_col, num_col);
|
||||
mat.info.num_col = num_col;
|
||||
}
|
||||
mat.info.num_row = nindptr - 1;
|
||||
mat.info.num_nonzero = static_cast<uint64_t>(nelem);
|
||||
mat.info.num_nonzero = nelem;
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
|
||||
const unsigned* indices,
|
||||
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
|
||||
const unsigned *indices,
|
||||
const float* data,
|
||||
xgboost::bst_ulong nindptr,
|
||||
xgboost::bst_ulong nelem,
|
||||
DMatrixHandle* out) {
|
||||
std::vector<size_t> indptr_(nindptr);
|
||||
for (bst_ulong i = 0; i < nindptr; ++i) {
|
||||
indptr_[i] = static_cast<size_t>(indptr[i]);
|
||||
}
|
||||
return XGDMatrixCreateFromCSREx(&indptr_[0], indices, data,
|
||||
static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
size_t nindptr,
|
||||
size_t nelem,
|
||||
size_t num_row,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
@ -270,31 +290,49 @@ XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
|
||||
builder.InitBudget(0, nthread);
|
||||
long ncol = static_cast<long>(nindptr - 1); // NOLINT(*)
|
||||
size_t ncol = nindptr - 1; // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
builder.AddBudget(indices[j], tid);
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
builder.Push(indices[j],
|
||||
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
mat.info.num_row = mat.row_ptr_.size() - 1;
|
||||
mat.info.num_col = static_cast<uint64_t>(ncol);
|
||||
if (num_row > 0) {
|
||||
CHECK_LE(mat.info.num_row, num_row);
|
||||
mat.info.num_row = num_row;
|
||||
}
|
||||
mat.info.num_col = ncol;
|
||||
mat.info.num_nonzero = nelem;
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
xgboost::bst_ulong nindptr,
|
||||
xgboost::bst_ulong nelem,
|
||||
DMatrixHandle* out) {
|
||||
std::vector<size_t> col_ptr_(nindptr);
|
||||
for (bst_ulong i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<size_t>(col_ptr[i]);
|
||||
}
|
||||
return XGDMatrixCreateFromCSCEx(&col_ptr_[0], indices, data,
|
||||
static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromMat(const float* data,
|
||||
xgboost::bst_ulong nrow,
|
||||
xgboost::bst_ulong ncol,
|
||||
|
||||
33
tests/python/test_sparse_dmatrix.py
Normal file
33
tests/python/test_sparse_dmatrix.py
Normal file
@ -0,0 +1,33 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from scipy.sparse import rand
|
||||
|
||||
rng = np.random.RandomState(1)
|
||||
|
||||
param = {'max_depth': 3, 'objective': 'binary:logistic', 'silent': 1}
|
||||
|
||||
|
||||
def test_sparse_dmatrix_csr():
|
||||
nrow = 100
|
||||
ncol = 1000
|
||||
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
|
||||
assert x.indices.max() < ncol - 1
|
||||
x.data[:] = 1
|
||||
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
|
||||
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
|
||||
watchlist = [(dtrain, 'train')]
|
||||
bst = xgb.train(param, dtrain, 5, watchlist)
|
||||
bst.predict(dtrain)
|
||||
|
||||
|
||||
def test_sparse_dmatrix_csc():
|
||||
nrow = 1000
|
||||
ncol = 100
|
||||
x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
|
||||
assert x.indices.max() < nrow - 1
|
||||
x.data[:] = 1
|
||||
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
|
||||
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
|
||||
watchlist = [(dtrain, 'train')]
|
||||
bst = xgb.train(param, dtrain, 5, watchlist)
|
||||
bst.predict(dtrain)
|
||||
Loading…
x
Reference in New Issue
Block a user