diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index a7753dfa5..bbb99615a 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -7,7 +7,6 @@ #include "wrapper/xgboost_wrapper.h" #include "src/utils/utils.h" #include "src/utils/omp.h" -#include "src/utils/matrix_csr.h" using namespace std; using namespace xgboost; @@ -91,37 +90,25 @@ extern "C" { SEXP indices, SEXP data) { _WrapperBegin(); - const int *col_ptr = INTEGER(indptr); - const int *row_index = INTEGER(indices); - const double *col_data = REAL(data); - int ncol = length(indptr) - 1; + const int *p_indptr = INTEGER(indptr); + const int *p_indices = INTEGER(indices); + const double *p_data = REAL(data); + int nindptr = length(indptr); int ndata = length(data); - // transform into CSR format - std::vector row_ptr; - std::vector< std::pair > csr_data; - utils::SparseCSRMBuilder, false, bst_ulong> builder(row_ptr, csr_data); - builder.InitBudget(); - for (int i = 0; i < ncol; ++i) { - for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.AddBudget(row_index[j]); - } + std::vector col_ptr_(nindptr); + std::vector indices_(ndata); + std::vector data_(ndata); + + for (int i = 0; i < nindptr; ++i) { + col_ptr_[i] = static_cast(p_indptr[i]); } - builder.InitStorage(); - for (int i = 0; i < ncol; ++i) { - for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.PushElem(row_index[j], std::make_pair(i, col_data[j])); - } - } - utils::Assert(csr_data.size() == static_cast(ndata), "BUG CreateFromCSC"); - std::vector row_data(ndata); - std::vector col_index(ndata); #pragma omp parallel for schedule(static) for (int i = 0; i < ndata; ++i) { - col_index[i] = csr_data[i].first; - row_data[i] = csr_data[i].second; + indices_[i] = static_cast(p_indices[i]); + data_[i] = static_cast(p_data[i]); } - void *handle = XGDMatrixCreateFromCSR(BeginPtr(row_ptr), BeginPtr(col_index), - BeginPtr(row_data), row_ptr.size(), ndata ); + void *handle = 
XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_), + BeginPtr(data_), nindptr, ndata); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); diff --git a/demo/guide-python/basic_walkthrough.py b/demo/guide-python/basic_walkthrough.py index 62c3fc010..81b35ab45 100755 --- a/demo/guide-python/basic_walkthrough.py +++ b/demo/guide-python/basic_walkthrough.py @@ -42,7 +42,7 @@ assert np.sum(np.abs(preds2-preds)) == 0 ### # build dmatrix from scipy.sparse -print ('start running example of build DMatrix from scipy.sparse') +print ('start running example of build DMatrix from scipy.sparse CSR Matrix') labels = [] row = []; col = []; dat = [] i = 0 @@ -54,8 +54,14 @@ for l in open('../data/agaricus.txt.train'): row.append(i); col.append(int(k)); dat.append(float(v)) i += 1 csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) -dtrain = xgb.DMatrix( csr ) -dtrain.set_label(labels) +dtrain = xgb.DMatrix( csr, label = labels ) +watchlist = [(dtest,'eval'), (dtrain,'train')] +bst = xgb.train( param, dtrain, num_round, watchlist ) + +print ('start running example of build DMatrix from scipy.sparse CSC Matrix') +# we can also construct from csc matrix +csc = scipy.sparse.csc_matrix( (dat, (row,col)) ) +dtrain = xgb.DMatrix(csc, label=labels) watchlist = [(dtest,'eval'), (dtrain,'train')] bst = xgb.train( param, dtrain, num_round, watchlist ) @@ -63,8 +69,7 @@ print ('start running example of build DMatrix from numpy array') # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation # then convert to DMatrix npymat = csr.todense() -dtrain = xgb.DMatrix( npymat) -dtrain.set_label(labels) +dtrain = xgb.DMatrix(npymat, label = labels) watchlist = [(dtest,'eval'), (dtrain,'train')] bst = xgb.train( param, dtrain, num_round, watchlist ) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 86a829d4d..b8eb773d7 100644 --- a/wrapper/xgboost.py 
+++ b/wrapper/xgboost.py @@ -22,6 +22,7 @@ xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH) # DMatrix functions xglib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p +xglib.XGDMatrixCreateFromCSC.restype = ctypes.c_void_p xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p xglib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float) @@ -66,6 +67,8 @@ class DMatrix: xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 0)) elif isinstance(data, scp.csr_matrix): self.__init_from_csr(data) + elif isinstance(data, scp.csc_matrix): + self.__init_from_csc(data) elif isinstance(data, numpy.ndarray) and len(data.shape) == 2: self.__init_from_npy2d(data, missing) else: @@ -88,6 +91,15 @@ class DMatrix: (ctypes.c_float * len(csr.data))(*csr.data), len(csr.indptr), len(csr.data))) + def __init_from_csc(self, csc): + """convert data from csc matrix""" + assert len(csc.indices) == len(csc.data) + self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC( + (ctypes.c_ulong * len(csc.indptr))(*csc.indptr), + (ctypes.c_uint * len(csc.indices))(*csc.indices), + (ctypes.c_float * len(csc.data))(*csc.data), + len(csc.indptr), len(csc.data))) + def __init_from_npy2d(self,mat,missing): """convert data from numpy matrix""" data = numpy.array(mat.reshape(mat.size), dtype='float32') diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index abb844bce..2bd734058 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -14,6 +14,7 @@ using namespace std; #include "../src/learner/learner-inl.hpp" #include "../src/io/io.h" #include "../src/utils/utils.h" +#include "../src/utils/matrix_csr.h" #include "../src/io/simple_dmatrix-inl.hpp" using namespace xgboost; @@ -102,6 +103,31 @@ extern "C"{ mat.info.info.num_row = nindptr - 1; return p_mat; } + XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, + const 
unsigned *indices, + const float *data, + bst_ulong nindptr, + bst_ulong nelem) { + DMatrixSimple *p_mat = new DMatrixSimple(); + DMatrixSimple &mat = *p_mat; + utils::SparseCSRMBuilder builder(mat.row_ptr_, mat.row_data_); + builder.InitBudget(); + bst_ulong ncol = nindptr - 1; + for (bst_ulong i = 0; i < ncol; ++i) { + for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.AddBudget(indices[j]); + } + } + builder.InitStorage(); + for (bst_ulong i = 0; i < ncol; ++i) { + for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.PushElem(indices[j], RowBatch::Entry(static_cast(i), data[j])); + } + } + mat.info.info.num_row = mat.row_ptr_.size() - 1; + mat.info.info.num_col = static_cast(ncol); + return p_mat; + } void* XGDMatrixCreateFromMat(const float *data, bst_ulong nrow, bst_ulong ncol, diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 9687ec0a3..0dd1f3606 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -22,7 +22,7 @@ extern "C" { * \return a loaded data matrix */ XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent); - /*! + /*! * \brief create a matrix content from csr format * \param indptr pointer to row headers * \param indices findex @@ -36,6 +36,20 @@ extern "C" { const float *data, bst_ulong nindptr, bst_ulong nelem); + /*! + * \brief create a matrix content from CSC format + * \param col_ptr pointer to col headers + * \param indices findex + * \param data fvalue + * \param nindptr number of columns in the matrix + 1 + * \param nelem number of nonzero elements in the matrix + * \return created dmatrix + */ + XGB_DLL void* XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, + const unsigned *indices, + const float *data, + bst_ulong nindptr, + bst_ulong nelem); /*! * \brief create matrix content from dense matrix * \param data pointer to the data space