More robust DMatrix creation from a sparse matrix (#1606)

* [CORE] DMatrix from sparse w/ explicit #col #row; safer arg types

* [python-package] c-api change for _init_from_csr _init_from_csc

* fix spaces

* [R-package] adopt the new XGDMatrixCreateFromCSCEx interface

* [CORE] redirect old sparse creators to new ones
This commit is contained in:
Vadim Khotilovich
2016-09-25 12:01:22 -05:00
committed by Tianqi Chen
parent e06f6a0df7
commit 693ddb860e
8 changed files with 167 additions and 41 deletions

View File

@@ -227,38 +227,58 @@ int XGDMatrixCreateFromDataIter(
API_END();
}
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
const unsigned *indices,
const float* data,
xgboost::bst_ulong nindptr,
xgboost::bst_ulong nelem,
DMatrixHandle* out) {
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices,
const float* data,
size_t nindptr,
size_t nelem,
size_t num_col,
DMatrixHandle* out) {
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
API_BEGIN();
data::SimpleCSRSource& mat = *source;
mat.row_ptr_.resize(nindptr);
for (xgboost::bst_ulong i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
for (size_t i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = indptr[i];
}
mat.row_data_.resize(nelem);
for (xgboost::bst_ulong i = 0; i < nelem; ++i) {
for (size_t i = 0; i < nelem; ++i) {
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
mat.info.num_col = std::max(mat.info.num_col,
static_cast<uint64_t>(indices[i] + 1));
}
if (num_col > 0) {
CHECK_LE(mat.info.num_col, num_col);
mat.info.num_col = num_col;
}
mat.info.num_row = nindptr - 1;
mat.info.num_nonzero = static_cast<uint64_t>(nelem);
mat.info.num_nonzero = nelem;
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
const unsigned* indices,
/*!
 * \brief Legacy CSR constructor kept for callers that pass bst_ulong offsets.
 *
 * Copies the row offsets into a size_t buffer and forwards everything to
 * XGDMatrixCreateFromCSREx, passing 0 as the explicit column count.
 *
 * \param indptr  row offset array; nindptr entries are read
 * \param indices column index array, forwarded unchanged
 * \param data    value array, forwarded unchanged
 * \param nindptr number of entries in indptr
 * \param nelem   number of entries in indices / data
 * \param out     receives the created handle
 * \return the return value of XGDMatrixCreateFromCSREx
 */
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
                                   const unsigned *indices,
                                   const float* data,
                                   xgboost::bst_ulong nindptr,
                                   xgboost::bst_ulong nelem,
                                   DMatrixHandle* out) {
  // Widen/convert each offset explicitly: bst_ulong and size_t may differ in width.
  std::vector<size_t> indptr_(nindptr);
  for (bst_ulong i = 0; i < nindptr; ++i) {
    indptr_[i] = static_cast<size_t>(indptr[i]);
  }
  // data() is well-defined even when the vector is empty, whereas
  // &indptr_[0] would index an empty vector (undefined behaviour).
  return XGDMatrixCreateFromCSREx(indptr_.data(), indices, data,
    static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
}
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
const unsigned* indices,
const float* data,
size_t nindptr,
size_t nelem,
size_t num_row,
DMatrixHandle* out) {
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
API_BEGIN();
@@ -270,31 +290,49 @@ XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
data::SimpleCSRSource& mat = *source;
common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
builder.InitBudget(0, nthread);
long ncol = static_cast<long>(nindptr - 1); // NOLINT(*)
size_t ncol = nindptr - 1; // NOLINT(*)
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(indices[j], tid);
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.Push(indices[j],
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
tid);
}
}
mat.info.num_row = mat.row_ptr_.size() - 1;
mat.info.num_col = static_cast<uint64_t>(ncol);
if (num_row > 0) {
CHECK_LE(mat.info.num_row, num_row);
mat.info.num_row = num_row;
}
mat.info.num_col = ncol;
mat.info.num_nonzero = nelem;
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
/*!
 * \brief Legacy CSC constructor kept for callers that pass bst_ulong offsets.
 *
 * Copies the column offsets into a size_t buffer and forwards everything to
 * XGDMatrixCreateFromCSCEx, passing 0 as the explicit row count.
 *
 * \param col_ptr column offset array; nindptr entries are read
 * \param indices row index array, forwarded unchanged
 * \param data    value array, forwarded unchanged
 * \param nindptr number of entries in col_ptr
 * \param nelem   number of entries in indices / data
 * \param out     receives the created handle
 * \return the return value of XGDMatrixCreateFromCSCEx
 */
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
                                   const unsigned* indices,
                                   const float* data,
                                   xgboost::bst_ulong nindptr,
                                   xgboost::bst_ulong nelem,
                                   DMatrixHandle* out) {
  // Widen/convert each offset explicitly: bst_ulong and size_t may differ in width.
  std::vector<size_t> col_ptr_(nindptr);
  for (bst_ulong i = 0; i < nindptr; ++i) {
    col_ptr_[i] = static_cast<size_t>(col_ptr[i]);
  }
  // data() is well-defined even when the vector is empty, whereas
  // &col_ptr_[0] would index an empty vector (undefined behaviour).
  return XGDMatrixCreateFromCSCEx(col_ptr_.data(), indices, data,
    static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
}
XGB_DLL int XGDMatrixCreateFromMat(const float* data,
xgboost::bst_ulong nrow,
xgboost::bst_ulong ncol,