some initial try of cachefiles

This commit is contained in:
tqchen
2015-04-15 15:15:23 -07:00
parent 3d8431fc5c
commit e8f6f3b541
13 changed files with 185 additions and 431 deletions

View File

@@ -87,27 +87,39 @@ def c_array(ctype, values):
class DMatrix(object):
def __init__(self, data, label=None, missing=0.0, weight=None):
def __init__(self, data, label=None, missing=0.0, weight=None, cache_file=None):
"""
Data matrix used in XGBoost.
Parameters
----------
data : string/numpy array/scipy.sparse
Data source, string type is the path of svmlight format txt file or xgb buffer.
Data source, string type is the path of svmlight format txt file,
xgb buffer or path to cache_file
label : list or numpy 1-D array (optional)
Label of the training data.
missing : float
Value in the data which needs to be treated as a missing value.
weight : list or numpy 1-D array (optional)
Weight for each instance.
cache_file : string (optional)
Prefix path for the binary cache of the input data. When this is set,
several binary cache files with the prefix cache_file will be created and
xgboost will try to use external memory as much as possible,
which in general saves memory during computation.
"""
# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle = None
return
if isinstance(data, string_types):
if cache_file is not None:
if not isinstance(data, string_types):
raise Exception('cache_file must be used together with input file name')
if not isinstance(cache_file, string_types):
raise Exception('cache_file must be string')
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateCache(c_str(data), c_str(cache_file), 0))
elif isinstance(data, string_types):
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0))
elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data)

View File

@@ -114,6 +114,11 @@ extern "C"{
/*!
 * \brief C entry point that builds a data matrix from a file.
 *
 * Thin forwarder to LoadDataMatrix: the integer `silent` flag is
 * collapsed to a boolean (any nonzero value becomes true) and the two
 * remaining boolean arguments are always passed as false.
 * \param fname file name, handed to LoadDataMatrix unchanged
 * \param silent integer flag; nonzero is forwarded as true
 * \return the result of LoadDataMatrix, returned as void*
 */
void* XGDMatrixCreateFromFile(const char *fname, int silent) {
  const bool is_silent = (silent != 0);
  return LoadDataMatrix(fname, is_silent, false, false);
}
/*!
 * \brief C entry point that builds a data matrix backed by a cache file.
 *
 * Thin forwarder to LoadDataMatrix: the integer `silent` flag is
 * collapsed to a boolean (any nonzero value becomes true), the two
 * following boolean arguments are always false, and `cache_file` is
 * passed through as the final argument.
 * \param fname file name, handed to LoadDataMatrix unchanged
 * \param cache_file cache file name, handed to LoadDataMatrix unchanged
 * \param silent integer flag; nonzero is forwarded as true
 * \return the result of LoadDataMatrix, returned as void*
 */
void* XGDMatrixCreateCache(const char *fname,
                           const char *cache_file,
                           int silent) {
  const bool is_silent = (silent != 0);
  return LoadDataMatrix(fname, is_silent, false, false, cache_file);
}
void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,

View File

@@ -19,9 +19,22 @@ extern "C" {
#endif
/*!
 * \brief load a data matrix from a file
 * \param fname the name of the file
 * \param silent flag controlling whether messages are printed during loading
 * \return a loaded data matrix, returned as a void* handle
 */
XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent);
/*!
 * \brief load a cached DMatrix; this is backed by several cache files
 *  and usually costs less memory
 * \param fname the name of the file, can be a cached buffer or text
 * \param cache_file the name of the cache file
 * \param silent flag controlling whether messages are printed during loading
 * \return a loaded data matrix, returned as a void* handle
 */
XGB_DLL void* XGDMatrixCreateCache(const char *fname,
const char *cache_file,
int silent);
/*!
* \brief create a matrix content from csr format
* \param indptr pointer to row headers