More robust DMatrix creation from a sparse matrix (#1606)
* [CORE] DMatrix from sparse w/ explicit #col #row; safer arg types * [python-package] c-api change for _init_from_csr _init_from_csc * fix spaces * [R-package] adopt the new XGDMatrixCreateFromCSCEx interface * [CORE] redirect old sparse creators to new ones
This commit is contained in:
committed by
Tianqi Chen
parent
e06f6a0df7
commit
693ddb860e
@@ -227,38 +227,58 @@ int XGDMatrixCreateFromDataIter(
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
|
||||
const unsigned *indices,
|
||||
const float* data,
|
||||
xgboost::bst_ulong nindptr,
|
||||
xgboost::bst_ulong nelem,
|
||||
DMatrixHandle* out) {
|
||||
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
size_t nindptr,
|
||||
size_t nelem,
|
||||
size_t num_col,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
mat.row_ptr_.resize(nindptr);
|
||||
for (xgboost::bst_ulong i = 0; i < nindptr; ++i) {
|
||||
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
|
||||
for (size_t i = 0; i < nindptr; ++i) {
|
||||
mat.row_ptr_[i] = indptr[i];
|
||||
}
|
||||
mat.row_data_.resize(nelem);
|
||||
for (xgboost::bst_ulong i = 0; i < nelem; ++i) {
|
||||
for (size_t i = 0; i < nelem; ++i) {
|
||||
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
|
||||
mat.info.num_col = std::max(mat.info.num_col,
|
||||
static_cast<uint64_t>(indices[i] + 1));
|
||||
}
|
||||
if (num_col > 0) {
|
||||
CHECK_LE(mat.info.num_col, num_col);
|
||||
mat.info.num_col = num_col;
|
||||
}
|
||||
mat.info.num_row = nindptr - 1;
|
||||
mat.info.num_nonzero = static_cast<uint64_t>(nelem);
|
||||
mat.info.num_nonzero = nelem;
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
|
||||
const unsigned* indices,
|
||||
XGB_DLL int XGDMatrixCreateFromCSR(const xgboost::bst_ulong* indptr,
|
||||
const unsigned *indices,
|
||||
const float* data,
|
||||
xgboost::bst_ulong nindptr,
|
||||
xgboost::bst_ulong nelem,
|
||||
DMatrixHandle* out) {
|
||||
std::vector<size_t> indptr_(nindptr);
|
||||
for (bst_ulong i = 0; i < nindptr; ++i) {
|
||||
indptr_[i] = static_cast<size_t>(indptr[i]);
|
||||
}
|
||||
return XGDMatrixCreateFromCSREx(&indptr_[0], indices, data,
|
||||
static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
size_t nindptr,
|
||||
size_t nelem,
|
||||
size_t num_row,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
@@ -270,31 +290,49 @@ XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
|
||||
builder.InitBudget(0, nthread);
|
||||
long ncol = static_cast<long>(nindptr - 1); // NOLINT(*)
|
||||
size_t ncol = nindptr - 1; // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
builder.AddBudget(indices[j], tid);
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
for (size_t i = 0; i < ncol; ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
builder.Push(indices[j],
|
||||
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
mat.info.num_row = mat.row_ptr_.size() - 1;
|
||||
mat.info.num_col = static_cast<uint64_t>(ncol);
|
||||
if (num_row > 0) {
|
||||
CHECK_LE(mat.info.num_row, num_row);
|
||||
mat.info.num_row = num_row;
|
||||
}
|
||||
mat.info.num_col = ncol;
|
||||
mat.info.num_nonzero = nelem;
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSC(const xgboost::bst_ulong* col_ptr,
|
||||
const unsigned* indices,
|
||||
const float* data,
|
||||
xgboost::bst_ulong nindptr,
|
||||
xgboost::bst_ulong nelem,
|
||||
DMatrixHandle* out) {
|
||||
std::vector<size_t> col_ptr_(nindptr);
|
||||
for (bst_ulong i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<size_t>(col_ptr[i]);
|
||||
}
|
||||
return XGDMatrixCreateFromCSCEx(&col_ptr_[0], indices, data,
|
||||
static_cast<size_t>(nindptr), static_cast<size_t>(nelem), 0, out);
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromMat(const float* data,
|
||||
xgboost::bst_ulong nrow,
|
||||
xgboost::bst_ulong ncol,
|
||||
|
||||
Reference in New Issue
Block a user