diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index a6420d5ff..732c5c726 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -27,7 +27,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) { PACKAGE = "xgboost") cnames <- colnames(data) } else if (class(data) == "dgCMatrix") { - handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, + handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data), PACKAGE = "xgboost") cnames <- colnames(data) } else { diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index bedd20aab..7d6ec26ee 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -87,30 +87,32 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, - SEXP data) { + SEXP data, + SEXP num_row) { SEXP ret; R_API_BEGIN(); const int *p_indptr = INTEGER(indptr); const int *p_indices = INTEGER(indices); const double *p_data = REAL(data); - int nindptr = length(indptr); - int ndata = length(data); - std::vector col_ptr_(nindptr); + size_t nindptr = static_cast(length(indptr)); + size_t ndata = static_cast(length(data)); + size_t nrow = static_cast(INTEGER(num_row)[0]); + std::vector col_ptr_(nindptr); std::vector indices_(ndata); std::vector data_(ndata); - for (int i = 0; i < nindptr; ++i) { - col_ptr_[i] = static_cast(p_indptr[i]); + for (size_t i = 0; i < nindptr; ++i) { + col_ptr_[i] = static_cast(p_indptr[i]); } #pragma omp parallel for schedule(static) - for (int i = 0; i < ndata; ++i) { + for (size_t i = 0; i < ndata; ++i) { indices_[i] = static_cast(p_indices[i]); data_[i] = static_cast(p_data[i]); } DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_), - BeginPtr(data_), nindptr, ndata, - &handle)); + CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_), + BeginPtr(data_), nindptr, ndata, + nrow, &handle)); ret = 
PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index b78a69b67..24c9b78f1 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -43,11 +43,13 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, * \param indptr pointer to column headers * \param indices row indices * \param data content of the data + * \param num_row number of rows (when it's set to 0, then guess from data) * \return created dmatrix */ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, - SEXP data); + SEXP data, + SEXP num_row); /*! * \brief create a new dmatrix from sliced content of existing matrix diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index baead5099..0aea2f0a7 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -1,4 +1,5 @@ require(xgboost) +require(Matrix) context("testing xgb.DMatrix functionality") @@ -65,3 +66,13 @@ test_that("xgb.DMatrix: colnames", { expect_silent(colnames(dtest) <- NULL) expect_null(colnames(dtest)) }) + +test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", { + set.seed(123) + nr <- 1000 + x <- rsparsematrix(nr, 100, density=0.0005) + # we want it very sparse, so that last rows are empty + expect_lt(max(x@i), nr) + dtest <- xgb.DMatrix(x) + expect_equal(dim(dtest), dim(x)) +}) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index eeec464b0..464fa280c 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -114,7 +114,26 @@ XGB_DLL int XGDMatrixCreateFromDataIter( DMatrixHandle *out); /*!
- * \brief create a matrix content from csr format + * \brief create a matrix content from CSR format + * \param indptr pointer to row headers + * \param indices findex + * \param data fvalue + * \param nindptr number of rows in the matrix + 1 + * \param nelem number of nonzero elements in the matrix + * \param num_col number of columns; when it's set to 0, then guess from data + * \param out created dmatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, + const unsigned* indices, + const float* data, + size_t nindptr, + size_t nelem, + size_t num_col, + DMatrixHandle* out); +/*! + * \deprecated + * \brief create a matrix content from CSR format * \param indptr pointer to row headers * \param indices findex * \param data fvalue @@ -130,6 +149,25 @@ XGB_DLL int XGDMatrixCreateFromCSR(const bst_ulong *indptr, bst_ulong nelem, DMatrixHandle *out); /*! + * \brief create a matrix content from CSC format + * \param col_ptr pointer to col headers + * \param indices findex + * \param data fvalue + * \param nindptr number of columns in the matrix + 1 + * \param nelem number of nonzero elements in the matrix + * \param num_row number of rows; when it's set to 0, then guess from data + * \param out created dmatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr, + const unsigned* indices, + const float* data, + size_t nindptr, + size_t nelem, + size_t num_row, + DMatrixHandle* out); +/*!
+ * \deprecated * \brief create a matrix content from CSC format * \param col_ptr pointer to col headers * \param indices findex diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index e31f622cf..64c598177 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -287,11 +287,12 @@ class DMatrix(object): if len(csr.indices) != len(csr.data): raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) self.handle = ctypes.c_void_p() - _check_call(_LIB.XGDMatrixCreateFromCSR(c_array(ctypes.c_ulong, csr.indptr), - c_array(ctypes.c_uint, csr.indices), - c_array(ctypes.c_float, csr.data), - len(csr.indptr), len(csr.data), - ctypes.byref(self.handle))) + _check_call(_LIB.XGDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr), + c_array(ctypes.c_uint, csr.indices), + c_array(ctypes.c_float, csr.data), + len(csr.indptr), len(csr.data), + csr.shape[1], + ctypes.byref(self.handle))) def _init_from_csc(self, csc): """ @@ -300,11 +301,12 @@ class DMatrix(object): if len(csc.indices) != len(csc.data): raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data))) self.handle = ctypes.c_void_p() - _check_call(_LIB.XGDMatrixCreateFromCSC(c_array(ctypes.c_ulong, csc.indptr), - c_array(ctypes.c_uint, csc.indices), - c_array(ctypes.c_float, csc.data), - len(csc.indptr), len(csc.data), - ctypes.byref(self.handle))) + _check_call(_LIB.XGDMatrixCreateFromCSCEx(c_array(ctypes.c_size_t, csc.indptr), + c_array(ctypes.c_uint, csc.indices), + c_array(ctypes.c_float, csc.data), + len(csc.indptr), len(csc.data), + csc.shape[0], + ctypes.byref(self.handle))) def _init_from_npy2d(self, mat, missing): """ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 857c9c169..9ae38c67b 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -227,38 +227,58 @@ int XGDMatrixCreateFromDataIter( API_END(); } -XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr, - const 
unsigned *indices, - const float* data, - xgboost::bst_ulong nindptr, - xgboost::bst_ulong nelem, - DMatrixHandle* out) { +XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, + const unsigned* indices, + const float* data, + size_t nindptr, + size_t nelem, + size_t num_col, + DMatrixHandle* out) { std::unique_ptr source(new data::SimpleCSRSource()); API_BEGIN(); data::SimpleCSRSource& mat = *source; mat.row_ptr_.resize(nindptr); - for (xgboost::bst_ulong i = 0; i < nindptr; ++i) { - mat.row_ptr_[i] = static_cast(indptr[i]); + for (size_t i = 0; i < nindptr; ++i) { + mat.row_ptr_[i] = indptr[i]; } mat.row_data_.resize(nelem); - for (xgboost::bst_ulong i = 0; i < nelem; ++i) { + for (size_t i = 0; i < nelem; ++i) { mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]); mat.info.num_col = std::max(mat.info.num_col, static_cast(indices[i] + 1)); } + if (num_col > 0) { + CHECK_LE(mat.info.num_col, num_col); + mat.info.num_col = num_col; + } mat.info.num_row = nindptr - 1; - mat.info.num_nonzero = static_cast(nelem); + mat.info.num_nonzero = nelem; *out = new std::shared_ptr(DMatrix::Create(std::move(source))); API_END(); } -XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr, - const unsigned* indices, +XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr, + const unsigned *indices, const float* data, xgboost::bst_ulong nindptr, xgboost::bst_ulong nelem, DMatrixHandle* out) { + std::vector indptr_(nindptr); + for (bst_ulong i = 0; i < nindptr; ++i) { + indptr_[i] = static_cast(indptr[i]); + } + return XGDMatrixCreateFromCSREx(&indptr_[0], indices, data, + static_cast(nindptr), static_cast(nelem), 0, out); +} + +XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr, + const unsigned* indices, + const float* data, + size_t nindptr, + size_t nelem, + size_t num_row, + DMatrixHandle* out) { std::unique_ptr source(new data::SimpleCSRSource()); API_BEGIN(); @@ -270,31 +290,49 @@ XGB_DLL int XGDMatrixCreateFromCSC(const 
xgboost::bst_ulong* col_ptr, data::SimpleCSRSource& mat = *source; common::ParallelGroupBuilder builder(&mat.row_ptr_, &mat.row_data_); builder.InitBudget(0, nthread); - long ncol = static_cast(nindptr - 1); // NOLINT(*) + size_t ncol = nindptr - 1; // NOLINT(*) #pragma omp parallel for schedule(static) - for (long i = 0; i < ncol; ++i) { // NOLINT(*) + for (size_t i = 0; i < ncol; ++i) { // NOLINT(*) int tid = omp_get_thread_num(); - for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) { builder.AddBudget(indices[j], tid); } } builder.InitStorage(); #pragma omp parallel for schedule(static) - for (long i = 0; i < ncol; ++i) { // NOLINT(*) + for (size_t i = 0; i < ncol; ++i) { // NOLINT(*) int tid = omp_get_thread_num(); - for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) { builder.Push(indices[j], RowBatch::Entry(static_cast(i), data[j]), tid); } } mat.info.num_row = mat.row_ptr_.size() - 1; - mat.info.num_col = static_cast(ncol); + if (num_row > 0) { + CHECK_LE(mat.info.num_row, num_row); + mat.info.num_row = num_row; + } + mat.info.num_col = ncol; mat.info.num_nonzero = nelem; *out = new std::shared_ptr(DMatrix::Create(std::move(source))); API_END(); } +XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr, + const unsigned* indices, + const float* data, + xgboost::bst_ulong nindptr, + xgboost::bst_ulong nelem, + DMatrixHandle* out) { + std::vector col_ptr_(nindptr); + for (bst_ulong i = 0; i < nindptr; ++i) { + col_ptr_[i] = static_cast(col_ptr[i]); + } + return XGDMatrixCreateFromCSCEx(&col_ptr_[0], indices, data, + static_cast(nindptr), static_cast(nelem), 0, out); +} + XGB_DLL int XGDMatrixCreateFromMat(const float* data, xgboost::bst_ulong nrow, xgboost::bst_ulong ncol, diff --git a/tests/python/test_sparse_dmatrix.py b/tests/python/test_sparse_dmatrix.py new file mode 100644 index 000000000..26c50461f --- /dev/null +++ 
b/tests/python/test_sparse_dmatrix.py @@ -0,0 +1,33 @@ +import numpy as np +import xgboost as xgb +from scipy.sparse import rand + +rng = np.random.RandomState(1) + +param = {'max_depth': 3, 'objective': 'binary:logistic', 'silent': 1} + + +def test_sparse_dmatrix_csr(): + nrow = 100 + ncol = 1000 + x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng) + assert x.indices.max() < ncol - 1 + x.data[:] = 1 + dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow)) + assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) + watchlist = [(dtrain, 'train')] + bst = xgb.train(param, dtrain, 5, watchlist) + bst.predict(dtrain) + + +def test_sparse_dmatrix_csc(): + nrow = 1000 + ncol = 100 + x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng) + assert x.indices.max() < nrow - 1 + x.data[:] = 1 + dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow)) + assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) + watchlist = [(dtrain, 'train')] + bst = xgb.train(param, dtrain, 5, watchlist) + bst.predict(dtrain)