More robust DMatrix creation from a sparse matrix (#1606)

* [CORE] DMatrix from sparse w/ explicit #col #row; safer arg types

* [python-package] c-api change for _init_from_csr _init_from_csc

* fix spaces

* [R-package] adopt the new XGDMatrixCreateFromCSCEx interface

* [CORE] redirect old sparse creators to new ones
This commit is contained in:
Vadim Khotilovich 2016-09-25 12:01:22 -05:00 committed by Tianqi Chen
parent e06f6a0df7
commit 693ddb860e
8 changed files with 167 additions and 41 deletions

View File

@ -27,7 +27,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
PACKAGE = "xgboost")
cnames <- colnames(data)
} else if (class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data),
PACKAGE = "xgboost")
cnames <- colnames(data)
} else {

View File

@ -87,30 +87,32 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data) {
SEXP data,
SEXP num_row) {
SEXP ret;
R_API_BEGIN();
const int *p_indptr = INTEGER(indptr);
const int *p_indices = INTEGER(indices);
const double *p_data = REAL(data);
int nindptr = length(indptr);
int ndata = length(data);
std::vector<bst_ulong> col_ptr_(nindptr);
size_t nindptr = static_cast<size_t>(length(indptr));
size_t ndata = static_cast<size_t>(length(data));
size_t nrow = static_cast<size_t>(INTEGER(num_row)[0]);
std::vector<size_t> col_ptr_(nindptr);
std::vector<unsigned> indices_(ndata);
std::vector<float> data_(ndata);
for (int i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
for (size_t i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
for (size_t i = 0; i < ndata; ++i) {
indices_[i] = static_cast<unsigned>(p_indices[i]);
data_[i] = static_cast<float>(p_data[i]);
}
DMatrixHandle handle;
CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata,
&handle));
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata,
nrow, &handle));
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();

View File

@ -43,11 +43,13 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \param num_row numer of rows (when it's set to 0, then guess from data)
* \return created dmatrix
*/
XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
SEXP data,
SEXP num_row);
/*!
* \brief create a new dmatrix from sliced content of existing matrix

View File

@ -1,4 +1,5 @@
require(xgboost)
require(Matrix)
context("testing xgb.DMatrix functionality")
@ -65,3 +66,13 @@ test_that("xgb.DMatrix: colnames", {
expect_silent(colnames(dtest) <- NULL)
expect_null(colnames(dtest))
})
test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
set.seed(123)
nr <- 1000
x <- rsparsematrix(nr, 100, density=0.0005)
# we want it very sparse, so that last rows are empty
expect_lt(max(x@i), nr)
dtest <- xgb.DMatrix(x)
expect_equal(dim(dtest), dim(x))
})

View File

@ -114,7 +114,26 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
DMatrixHandle *out);
/*!
* \brief create a matrix content from csr format
* \brief create a matrix content from CSR format
* \param indptr pointer to row headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param out created dmatrix
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices,
const float* data,
size_t nindptr,
size_t nelem,
size_t num_col,
DMatrixHandle* out);
/*!
* \deprecated
* \brief create a matrix content from CSR format
* \param indptr pointer to row headers
* \param indices findex
* \param data fvalue
@ -130,6 +149,25 @@ XGB_DLL int XGDMatrixCreateFromCSR(const bst_ulong *indptr,
bst_ulong nelem,
DMatrixHandle *out);
/*!
* \brief create a matrix content from CSC format
* \param col_ptr pointer to col headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param out created dmatrix
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
const unsigned* indices,
const float* data,
size_t nindptr,
size_t nelem,
size_t num_row,
DMatrixHandle* out);
/*!
* \deprecated
* \brief create a matrix content from CSC format
* \param col_ptr pointer to col headers
* \param indices findex

View File

@ -287,11 +287,12 @@ class DMatrix(object):
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixCreateFromCSR(c_array(ctypes.c_ulong, csr.indptr),
c_array(ctypes.c_uint, csr.indices),
c_array(ctypes.c_float, csr.data),
len(csr.indptr), len(csr.data),
ctypes.byref(self.handle)))
_check_call(_LIB.XGDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr),
c_array(ctypes.c_uint, csr.indices),
c_array(ctypes.c_float, csr.data),
len(csr.indptr), len(csr.data),
csr.shape[1],
ctypes.byref(self.handle)))
def _init_from_csc(self, csc):
"""
@ -300,11 +301,12 @@ class DMatrix(object):
if len(csc.indices) != len(csc.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixCreateFromCSC(c_array(ctypes.c_ulong, csc.indptr),
c_array(ctypes.c_uint, csc.indices),
c_array(ctypes.c_float, csc.data),
len(csc.indptr), len(csc.data),
ctypes.byref(self.handle)))
_check_call(_LIB.XGDMatrixCreateFromCSCEx(c_array(ctypes.c_size_t, csc.indptr),
c_array(ctypes.c_uint, csc.indices),
c_array(ctypes.c_float, csc.data),
len(csc.indptr), len(csc.data),
csc.shape[0],
ctypes.byref(self.handle)))
def _init_from_npy2d(self, mat, missing):
"""

View File

@ -227,38 +227,58 @@ int XGDMatrixCreateFromDataIter(
API_END();
}
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
const unsigned *indices,
const float* data,
xgboost::bst_ulong nindptr,
xgboost::bst_ulong nelem,
DMatrixHandle* out) {
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices,
const float* data,
size_t nindptr,
size_t nelem,
size_t num_col,
DMatrixHandle* out) {
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
API_BEGIN();
data::SimpleCSRSource& mat = *source;
mat.row_ptr_.resize(nindptr);
for (xgboost::bst_ulong i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
for (size_t i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = indptr[i];
}
mat.row_data_.resize(nelem);
for (xgboost::bst_ulong i = 0; i < nelem; ++i) {
for (size_t i = 0; i < nelem; ++i) {
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
mat.info.num_col = std::max(mat.info.num_col,
static_cast<uint64_t>(indices[i] + 1));
}
if (num_col > 0) {
CHECK_LE(mat.info.num_col, num_col);
mat.info.num_col = num_col;
}
mat.info.num_row = nindptr - 1;
mat.info.num_nonzero = static_cast<uint64_t>(nelem);
mat.info.num_nonzero = nelem;
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
const unsigned* indices,
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
const unsigned *indices,
const float* data,
xgboost::bst_ulong nindptr,
xgboost::bst_ulong nelem,
DMatrixHandle* out) {
std::vector<size_t> indptr_(nindptr);
for (bst_ulong i = 0; i < nindptr; ++i) {
indptr_[i] = static_cast<size_t>(indptr[i]);
}
return XGDMatrixCreateFromCSREx(&indptr_[0], indices, data,
static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
}
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
const unsigned* indices,
const float* data,
size_t nindptr,
size_t nelem,
size_t num_row,
DMatrixHandle* out) {
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
API_BEGIN();
@ -270,31 +290,49 @@ XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
data::SimpleCSRSource& mat = *source;
common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
builder.InitBudget(0, nthread);
long ncol = static_cast<long>(nindptr - 1); // NOLINT(*)
size_t ncol = nindptr - 1; // NOLINT(*)
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(indices[j], tid);
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.Push(indices[j],
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
tid);
}
}
mat.info.num_row = mat.row_ptr_.size() - 1;
mat.info.num_col = static_cast<uint64_t>(ncol);
if (num_row > 0) {
CHECK_LE(mat.info.num_row, num_row);
mat.info.num_row = num_row;
}
mat.info.num_col = ncol;
mat.info.num_nonzero = nelem;
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
const unsigned* indices,
const float* data,
xgboost::bst_ulong nindptr,
xgboost::bst_ulong nelem,
DMatrixHandle* out) {
std::vector<size_t> col_ptr_(nindptr);
for (bst_ulong i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<size_t>(col_ptr[i]);
}
return XGDMatrixCreateFromCSCEx(&col_ptr_[0], indices, data,
static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
}
XGB_DLL int XGDMatrixCreateFromMat(const float* data,
xgboost::bst_ulong nrow,
xgboost::bst_ulong ncol,

View File

@ -0,0 +1,33 @@
import numpy as np
import xgboost as xgb
from scipy.sparse import rand
rng = np.random.RandomState(1)
param = {'max_depth': 3, 'objective': 'binary:logistic', 'silent': 1}
def test_sparse_dmatrix_csr():
nrow = 100
ncol = 1000
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
assert x.indices.max() < ncol - 1
x.data[:] = 1
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
watchlist = [(dtrain, 'train')]
bst = xgb.train(param, dtrain, 5, watchlist)
bst.predict(dtrain)
def test_sparse_dmatrix_csc():
nrow = 1000
ncol = 100
x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
assert x.indices.max() < nrow - 1
x.data[:] = 1
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
watchlist = [(dtrain, 'train')]
bst = xgb.train(param, dtrain, 5, watchlist)
bst.predict(dtrain)